-rw-r--r--  .ci/generate_test_report_github.py | 15
-rw-r--r--  .ci/generate_test_report_lib.py | 73
-rw-r--r--  .ci/generate_test_report_lib_test.py | 154
-rwxr-xr-x  .ci/monolithic-windows.sh | 3
-rw-r--r--  .github/renovate.json | 9
-rw-r--r--  .github/workflows/build-ci-container-tooling.yml | 97
-rw-r--r--  .github/workflows/build-ci-container-windows.yml | 4
-rw-r--r--  .github/workflows/build-ci-container.yml | 107
-rw-r--r--  .github/workflows/build-container/action.yml | 95
-rw-r--r--  .github/workflows/build-metrics-container.yml | 4
-rw-r--r--  .github/workflows/ci-post-commit-analyzer.yml | 2
-rw-r--r--  .github/workflows/commit-access-review.yml | 2
-rw-r--r--  .github/workflows/containers/github-action-ci-tooling/Dockerfile | 3
-rw-r--r--  .github/workflows/docs.yml | 2
-rw-r--r--  .github/workflows/email-check.yaml | 2
-rw-r--r--  .github/workflows/libclang-abi-tests.yml | 8
-rw-r--r--  .github/workflows/libcxx-build-and-test.yaml | 8
-rw-r--r--  .github/workflows/llvm-abi-tests.yml | 12
-rw-r--r--  .github/workflows/pr-code-format.yml | 2
-rw-r--r--  .github/workflows/pr-code-lint.yml | 2
-rw-r--r--  .github/workflows/pr-request-release-note.yml | 2
-rw-r--r--  .github/workflows/premerge.yaml | 4
-rw-r--r--  .github/workflows/push-container/action.yml | 44
-rw-r--r--  .github/workflows/release-binaries.yml | 6
-rw-r--r--  .github/workflows/release-documentation.yml | 2
-rw-r--r--  .github/workflows/release-sources.yml | 2
-rw-r--r--  .github/workflows/scorecard.yml | 2
-rw-r--r--  bolt/lib/Core/BinaryContext.cpp | 8
-rw-r--r--  clang/cmake/caches/Fuchsia-stage2.cmake | 5
-rw-r--r--  clang/docs/ReleaseNotes.rst | 2
-rw-r--r--  clang/include/clang/AST/JSONNodeDumper.h | 2
-rw-r--r--  clang/include/clang/Basic/DiagnosticDriverKinds.td | 2
-rw-r--r--  clang/include/clang/Sema/Sema.h | 131
-rw-r--r--  clang/lib/AST/ExprConstant.cpp | 536
-rw-r--r--  clang/lib/AST/JSONNodeDumper.cpp | 21
-rw-r--r--  clang/lib/AST/TextNodeDumper.cpp | 14
-rw-r--r--  clang/lib/Analysis/ExprMutationAnalyzer.cpp | 10
-rw-r--r--  clang/lib/CodeGen/CodeGenPGO.cpp | 9
-rw-r--r--  clang/lib/Driver/ToolChains/AMDGPU.cpp | 15
-rw-r--r--  clang/lib/Driver/ToolChains/HLSL.cpp | 4
-rw-r--r--  clang/lib/Driver/ToolChains/HLSL.h | 2
-rw-r--r--  clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h | 72
-rw-r--r--  clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h | 12
-rw-r--r--  clang/lib/Headers/hlsl/hlsl_intrinsics.h | 61
-rw-r--r--  clang/lib/Interpreter/Interpreter.cpp | 49
-rw-r--r--  clang/lib/Parse/ParseExprCXX.cpp | 22
-rw-r--r--  clang/lib/Sema/Sema.cpp | 42
-rw-r--r--  clang/lib/Sema/SemaAMDGPU.cpp | 2
-rw-r--r--  clang/lib/Sema/SemaConcept.cpp | 58
-rw-r--r--  clang/lib/Sema/SemaDecl.cpp | 11
-rw-r--r--  clang/lib/Sema/SemaExpr.cpp | 16
-rw-r--r--  clang/lib/Sema/SemaTemplate.cpp | 30
-rw-r--r--  clang/lib/Sema/SemaTemplateDeduction.cpp | 130
-rw-r--r--  clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 1
-rw-r--r--  clang/lib/Sema/SemaTemplateInstantiate.cpp | 182
-rw-r--r--  clang/lib/Sema/SemaTemplateInstantiateDecl.cpp | 9
-rw-r--r--  clang/lib/Sema/SemaTemplateVariadic.cpp | 11
-rw-r--r--  clang/lib/Sema/TreeTransform.h | 20
-rw-r--r--  clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp | 3
-rw-r--r--  clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp | 2
-rw-r--r--  clang/test/C/C2y/n3525.c | 30
-rw-r--r--  clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl | 163
-rw-r--r--  clang/test/Driver/HLSL/wconversion.hlsl | 7
-rw-r--r--  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1250.bc | 0
-rw-r--r--  clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1251.bc | 0
-rw-r--r--  clang/test/Driver/amdgpu-openmp-sanitize-options.c | 10
-rw-r--r--  clang/test/Driver/hip-sanitize-options.hip | 5
-rw-r--r--  clang/test/Driver/rocm-device-libs.cl | 12
-rw-r--r--  clang/test/Parser/lambda-misplaced-capture-default.cpp | 9
-rw-r--r--  clang/test/Profile/Inputs/c-counter-overflows.proftext | 2
-rw-r--r--  clang/test/Profile/Inputs/c-general.profdata.v12 | bin 0 -> 2616 bytes
-rw-r--r--  clang/test/Profile/Inputs/c-general.proftext | 12
-rw-r--r--  clang/test/Profile/Inputs/c-unprofiled-blocks.proftext | 4
-rw-r--r--  clang/test/Profile/Inputs/cxx-rangefor.proftext | 2
-rw-r--r--  clang/test/Profile/Inputs/cxx-throws.proftext | 2
-rw-r--r--  clang/test/Profile/Inputs/misexpect-switch-default.proftext | 2
-rw-r--r--  clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext | 2
-rw-r--r--  clang/test/Profile/c-collision.c | 4
-rw-r--r--  clang/test/Profile/c-general.c | 1
-rw-r--r--  clang/test/SemaCXX/attr-mode-tmpl.cpp | 2
-rw-r--r--  clang/test/SemaCXX/cxx23-assume.cpp | 13
-rw-r--r--  clang/test/SemaCXX/cxx2b-warn-shadow.cpp | 26
-rw-r--r--  clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl | 2
-rw-r--r--  clang/test/SemaHLSL/Types/AggregateSplatConstantExpr.hlsl | 89
-rw-r--r--  clang/test/SemaHLSL/Types/ElementwiseCastConstantExpr.hlsl | 90
-rw-r--r--  clang/test/SemaTemplate/temp_arg_nontype.cpp | 3
-rw-r--r--  clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp | 2
-rw-r--r--  clang/tools/clang-repl/ClangRepl.cpp | 1
-rw-r--r--  clang/www/c_status.html | 2
-rw-r--r--  compiler-rt/include/profile/InstrProfData.inc | 2
-rw-r--r--  compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp | 78
-rw-r--r--  compiler-rt/test/asan/TestCases/Darwin/asan-verify-module-map.cpp | 25
-rw-r--r--  flang/include/flang/Optimizer/Builder/HLFIRTools.h | 35
-rw-r--r--  flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h | 51
-rw-r--r--  flang/include/flang/Optimizer/OpenACC/Passes.h | 4
-rw-r--r--  flang/include/flang/Optimizer/OpenACC/Passes.td | 16
-rw-r--r--  flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h | 57
-rw-r--r--  flang/lib/Frontend/CompilerInvocation.cpp | 12
-rw-r--r--  flang/lib/Lower/OpenACC.cpp | 574
-rw-r--r--  flang/lib/Optimizer/Builder/HLFIRTools.cpp | 61
-rw-r--r--  flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp | 22
-rw-r--r--  flang/lib/Optimizer/OpenACC/Analysis/CMakeLists.txt | 22
-rw-r--r--  flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp | 40
-rw-r--r--  flang/lib/Optimizer/OpenACC/CMakeLists.txt | 1
-rw-r--r--  flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt | 1
-rw-r--r--  flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp | 269
-rw-r--r--  flang/lib/Optimizer/OpenACC/Transforms/ACCInitializeFIRAnalyses.cpp | 56
-rw-r--r--  flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt | 4
-rw-r--r--  flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp | 25
-rw-r--r--  flang/test/Driver/multiple-actions-error.f95 | 38
-rw-r--r--  flang/test/Integration/debug-proc-ptr-e2e.f90 | 26
-rw-r--r--  flang/test/Lower/OpenACC/acc-private.f90 | 53
-rw-r--r--  flang/test/Lower/OpenACC/acc-reduction.f90 | 1957
-rw-r--r--  flang/test/Transforms/OpenACC/acc-implicit-copy-reduction.fir | 134
-rw-r--r--  flang/test/Transforms/OpenACC/acc-implicit-data-derived-type-member.F90 | 38
-rw-r--r--  flang/test/Transforms/OpenACC/acc-implicit-data-fortran.F90 | 79
-rw-r--r--  flang/test/Transforms/OpenACC/acc-implicit-data.fir | 358
-rw-r--r--  flang/test/Transforms/OpenACC/acc-implicit-firstprivate.fir | 284
-rw-r--r--  flang/test/Transforms/debug-proc-ptr.fir | 41
-rw-r--r--  libc/src/string/string_utils.h | 18
-rw-r--r--  libc/test/src/stdlib/CMakeLists.txt | 1
-rw-r--r--  libc/test/src/stdlib/StrfromTest.h | 4
-rw-r--r--  libc/test/src/string/memchr_test.cpp | 5
-rw-r--r--  libcxx/include/CMakeLists.txt | 1
-rw-r--r--  libcxx/include/__chrono/is_clock.h | 72
-rw-r--r--  libcxx/include/chrono | 4
-rw-r--r--  libcxx/include/module.modulemap.in | 4
-rw-r--r--  libcxx/modules/std/chrono.inc | 4
-rw-r--r--  libcxx/test/libcxx/time/time.traits.is.clock/trait.is.clock.compile.verify.cpp | 24
-rw-r--r--  libcxx/test/std/time/time.traits.is.clock/trait.is.clock.compile.pass.cpp | 225
-rw-r--r--  lld/ELF/SyntheticSections.cpp | 2
-rw-r--r--  lldb/bindings/python/CMakeLists.txt | 1
-rw-r--r--  lldb/bindings/python/python-swigsafecast.swig | 5
-rw-r--r--  lldb/bindings/python/python-wrapper.swig | 12
-rw-r--r--  lldb/examples/python/templates/scripted_frame_provider.py | 113
-rw-r--r--  lldb/include/lldb/API/SBFrameList.h | 14
-rw-r--r--  lldb/include/lldb/API/SBModuleSpec.h | 10
-rw-r--r--  lldb/include/lldb/API/SBTarget.h | 1
-rw-r--r--  lldb/include/lldb/Core/ModuleList.h | 4
-rw-r--r--  lldb/include/lldb/Core/ModuleSpec.h | 18
-rw-r--r--  lldb/include/lldb/Core/PluginManager.h | 18
-rw-r--r--  lldb/include/lldb/Core/Section.h | 3
-rw-r--r--  lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h | 30
-rw-r--r--  lldb/include/lldb/Interpreter/ScriptInterpreter.h | 10
-rw-r--r--  lldb/include/lldb/Symbol/ObjectFile.h | 6
-rw-r--r--  lldb/include/lldb/Target/Platform.h | 16
-rw-r--r--  lldb/include/lldb/Target/RemoteAwarePlatform.h | 6
-rw-r--r--  lldb/include/lldb/Target/SyntheticFrameProvider.h | 156
-rw-r--r--  lldb/include/lldb/lldb-forward.h | 6
-rw-r--r--  lldb/include/lldb/lldb-private-interfaces.h | 9
-rw-r--r--  lldb/packages/Python/lldbsuite/test/make/Makefile.rules | 5
-rw-r--r--  lldb/source/API/SBModule.cpp | 4
-rw-r--r--  lldb/source/API/SBModuleSpec.cpp | 13
-rw-r--r--  lldb/source/Core/DynamicLoader.cpp | 5
-rw-r--r--  lldb/source/Core/ModuleList.cpp | 34
-rw-r--r--  lldb/source/Core/PluginManager.cpp | 55
-rw-r--r--  lldb/source/Core/Section.cpp | 4
-rw-r--r--  lldb/source/Interpreter/ScriptInterpreter.cpp | 5
-rw-r--r--  lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp | 6
-rw-r--r--  lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp | 5
-rw-r--r--  lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.h | 8
-rw-r--r--  lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp | 14
-rw-r--r--  lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h | 2
-rw-r--r--  lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp | 10
-rw-r--r--  lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h | 1
-rw-r--r--  lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp | 47
-rw-r--r--  lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h | 3
-rw-r--r--  lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp | 10
-rw-r--r--  lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.h | 1
-rw-r--r--  lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp | 22
-rw-r--r--  lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h | 11
-rw-r--r--  lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp | 9
-rw-r--r--  lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h | 1
-rw-r--r--  lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp | 20
-rw-r--r--  lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h | 1
-rw-r--r--  lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp | 3
-rw-r--r--  lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp | 3
-rw-r--r--  lldb/source/Plugins/Process/scripted/ScriptedFrame.h | 1
-rw-r--r--  lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt | 1
-rw-r--r--  lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.h | 1
-rw-r--r--  lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp | 57
-rw-r--r--  lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h | 44
-rw-r--r--  lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp | 17
-rw-r--r--  lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h | 13
-rw-r--r--  lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h | 2
-rw-r--r--  lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp | 5
-rw-r--r--  lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h | 3
-rw-r--r--  lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp | 2
-rw-r--r--  lldb/source/Target/CMakeLists.txt | 1
-rw-r--r--  lldb/source/Target/ModuleCache.cpp | 2
-rw-r--r--  lldb/source/Target/Platform.cpp | 44
-rw-r--r--  lldb/source/Target/RemoteAwarePlatform.cpp | 11
-rw-r--r--  lldb/source/Target/SyntheticFrameProvider.cpp | 100
-rw-r--r--  lldb/source/Target/Target.cpp | 18
-rw-r--r--  lldb/source/Target/TargetList.cpp | 8
-rw-r--r--  lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py | 2
-rw-r--r--  lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multimap/TestDataFormatterGenericMultiMap.py | 2
-rw-r--r--  lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py | 2
-rw-r--r--  lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/set/TestDataFormatterGenericSet.py | 2
-rw-r--r--  lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py | 2
-rw-r--r--  lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py | 2
-rw-r--r--  lldb/test/Shell/Commands/Inputs/sigchld.c | 4
-rw-r--r--  lldb/test/Shell/Commands/command-list-reach-beginning-of-file.test | 4
-rw-r--r--  lldb/tools/debugserver/source/MacOSX/MachProcess.mm | 6
-rw-r--r--  lldb/unittests/Core/CMakeLists.txt | 1
-rw-r--r--  lldb/unittests/Core/ModuleListTest.cpp | 178
-rw-r--r--  lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp | 10
-rw-r--r--  lldb/unittests/Target/LocateModuleCallbackTest.cpp | 20
-rw-r--r--  lldb/unittests/Target/RemoteAwarePlatformTest.cpp | 17
-rw-r--r--  llvm/docs/Extensions.rst | 18
-rw-r--r--  llvm/docs/LangRef.rst | 31
-rw-r--r--  llvm/docs/ReleaseNotes.md | 3
-rw-r--r--  llvm/include/llvm/Analysis/RegionPrinter.h | 104
-rw-r--r--  llvm/include/llvm/CodeGen/Analysis.h | 16
-rw-r--r--  llvm/include/llvm/CodeGen/ISDOpcodes.h | 3
-rw-r--r--  llvm/include/llvm/CodeGen/LibcallLoweringInfo.h | 9
-rw-r--r--  llvm/include/llvm/CodeGen/SelectionDAGISel.h | 1
-rw-r--r--  llvm/include/llvm/IR/Intrinsics.td | 3
-rw-r--r--  llvm/include/llvm/IR/PatternMatch.h | 22
-rw-r--r--  llvm/include/llvm/ProfileData/InstrProf.h | 10
-rw-r--r--  llvm/include/llvm/ProfileData/InstrProfData.inc | 2
-rw-r--r--  llvm/include/llvm/Support/Casting.h | 16
-rw-r--r--  llvm/include/llvm/Support/TargetOpcodes.def | 3
-rw-r--r--  llvm/include/llvm/Support/thread.h | 4
-rw-r--r--  llvm/include/llvm/Target/Target.td | 5
-rw-r--r--  llvm/include/llvm/TargetParser/TargetParser.h | 3
-rw-r--r--  llvm/lib/Analysis/DependenceAnalysis.cpp | 19
-rw-r--r--  llvm/lib/Analysis/RegionPrinter.cpp | 11
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 11
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 7
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 7
-rw-r--r--  llvm/lib/CodeGen/MIRParser/MIParser.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/MachineInstr.cpp | 8
-rw-r--r--  llvm/lib/CodeGen/MachineInstrBundle.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/SafeStack.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 15
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 8
-rw-r--r--  llvm/lib/CodeGen/TwoAddressInstructionPass.cpp | 11
-rw-r--r--  llvm/lib/IR/DebugInfoMetadata.cpp | 21
-rw-r--r--  llvm/lib/IR/Verifier.cpp | 6
-rw-r--r--  llvm/lib/ProfileData/InstrProf.cpp | 5
-rw-r--r--  llvm/lib/ProfileData/InstrProfWriter.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3
-rw-r--r--  llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp | 1
-rw-r--r--  llvm/lib/Target/BPF/BPFISelLowering.cpp | 20
-rw-r--r--  llvm/lib/Target/BPF/BPFISelLowering.h | 14
-rw-r--r--  llvm/lib/Target/BPF/BPFInstrInfo.td | 6
-rw-r--r--  llvm/lib/Target/BPF/BPFPreserveDIType.cpp | 4
-rw-r--r--  llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp | 18
-rw-r--r--  llvm/lib/Target/BPF/BPFSelectionDAGInfo.h | 10
-rw-r--r--  llvm/lib/Target/BPF/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/Target/DirectX/DXILDataScalarization.cpp | 68
-rw-r--r--  llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp | 4
-rw-r--r--  llvm/lib/Target/DirectX/DXILOpLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 13
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonInstrInfo.h | 1
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp | 145
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3
-rw-r--r--  llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 45
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td | 66
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 5
-rw-r--r--  llvm/lib/TargetParser/TargetParser.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp | 51
-rw-r--r--  llvm/lib/Transforms/Utils/BypassSlowDivision.cpp | 32
-rw-r--r--  llvm/lib/Transforms/Utils/LoopSimplify.cpp | 60
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 33
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 9
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll | 2
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll | 86
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll | 15
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll | 32
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll | 7
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 7924
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 301
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 631
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll | 157
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 231
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 196
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 332
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 609
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 412
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 959
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll | 49
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-argument-types.ll | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ds_write2.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 63
-rw-r--r--  llvm/test/CodeGen/AMDGPU/finalizebundle.mir | 52
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll | 100
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll | 21
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 156
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 18
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 15
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 214
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll | 31
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll | 58
-rw-r--r--  llvm/test/CodeGen/AMDGPU/max.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll | 1765
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 203
-rw-r--r--  llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir | 5
-rw-r--r--  llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll | 41
-rw-r--r--  llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/scratch-simple.ll | 1728
-rw-r--r--  llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll | 15
-rw-r--r--  llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll | 22
-rw-r--r--  llvm/test/CodeGen/AMDGPU/stack-realign.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 57
-rw-r--r--  llvm/test/CodeGen/DirectX/llvm_assume.ll | 9
-rw-r--r--  llvm/test/CodeGen/DirectX/scalarize-alloca.ll | 65
-rw-r--r--  llvm/test/CodeGen/DirectX/scalarize-global.ll | 70
-rw-r--r--  llvm/test/CodeGen/Generic/reloc-none.ll | 10
-rw-r--r--  llvm/test/CodeGen/Hexagon/autohvx/xqf-fixup-qfp1.ll | 372
-rw-r--r--  llvm/test/CodeGen/Hexagon/hvx-vsub-qf-sf-mix.ll | 60
-rw-r--r--  llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll | 4
-rw-r--r--  llvm/test/CodeGen/Hexagon/vect-qfp.mir | 202
-rw-r--r--  llvm/test/CodeGen/Hexagon/vect/vect-qfp-unary.mir | 97
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt | 1
-rw-r--r--  llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt | 1
-rw-r--r--  llvm/test/CodeGen/PowerPC/annotate-metadata.ll | 15
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll | 8
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll | 75
-rw-r--r--  llvm/test/CodeGen/SystemZ/vec-load-element.ll | 4
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir | 45
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll | 14
-rw-r--r--  llvm/test/CodeGen/X86/bittest-big-integer.ll | 59
-rw-r--r--  llvm/test/DebugInfo/extradata-node-reference.ll | 101
-rw-r--r--  llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td | 2
-rw-r--r--  llvm/test/TableGen/RegClassByHwMode.td | 13
-rw-r--r--  llvm/test/TableGen/get-named-operand-idx.td | 19
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll | 25
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll | 37
-rw-r--r--  llvm/test/Verifier/reloc-none.ll | 13
-rw-r--r--  llvm/test/tools/dxil-dis/llvm_assume.ll | 11
-rw-r--r--  llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt | 52
-rw-r--r--  llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt | 11239
-rw-r--r--  llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s | 1118
-rw-r--r--  llvm/test/tools/llvm-profdata/profile-version.test | 2
-rw-r--r--  llvm/unittests/IR/PatternMatch.cpp | 27
-rw-r--r--  llvm/unittests/Transforms/Vectorize/VPlanTest.cpp | 196
-rw-r--r--  llvm/utils/TableGen/InstrInfoEmitter.cpp | 1093
-rw-r--r--  llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1
-rw-r--r--  llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn | 1
-rw-r--r--  mlir/cmake/modules/AddMLIRPython.cmake | 4
-rw-r--r--  mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td | 2
-rw-r--r--  mlir/include/mlir/Dialect/MemRef/IR/MemRef.h | 1
-rw-r--r--  mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td | 18
-rw-r--r--  mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td | 2
-rw-r--r--  mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.h | 3
-rw-r--r--  mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td | 36
-rw-r--r--  mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCooperativeMatrixOps.td | 6
-rw-r--r--  mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td | 8
-rw-r--r--  mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.h | 1
-rw-r--r--  mlir/include/mlir/Dialect/Vector/IR/VectorOps.h | 1
-rw-r--r--  mlir/include/mlir/Dialect/Vector/IR/VectorOps.td | 35
-rw-r--r--  mlir/include/mlir/Interfaces/AlignmentAttrInterface.h | 21
-rw-r--r--  mlir/include/mlir/Interfaces/AlignmentAttrInterface.td | 65
-rw-r--r--  mlir/include/mlir/Interfaces/CMakeLists.txt | 1
-rw-r--r--  mlir/include/mlir/TableGen/CodeGenHelpers.h | 24
-rw-r--r--  mlir/lib/Dialect/Async/IR/Async.cpp | 2
-rw-r--r--  mlir/lib/Dialect/DLTI/DLTI.cpp | 5
-rw-r--r--  mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 2
-rw-r--r--  mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 68
-rw-r--r--  mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitData.cpp | 880
-rw-r--r--  mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt | 4
-rw-r--r--  mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp | 2
-rw-r--r--  mlir/lib/IR/BuiltinTypeInterfaces.cpp | 2
-rw-r--r--  mlir/lib/IR/Operation.cpp | 3
-rw-r--r--  mlir/lib/Interfaces/AlignmentAttrInterface.cpp | 13
-rw-r--r--  mlir/lib/Interfaces/CMakeLists.txt | 2
-rw-r--r--  mlir/lib/TableGen/CodeGenHelpers.cpp | 90
-rw-r--r--  mlir/test/Dialect/MemRef/invalid.mlir | 16
-rw-r--r--  mlir/test/Dialect/OpenACC/acc-implicit-data-reduction.mlir | 109
-rw-r--r--  mlir/test/Dialect/OpenACC/acc-implicit-data.mlir | 224
-rw-r--r--  mlir/test/Dialect/OpenACC/canonicalize.mlir | 27
-rw-r--r--  mlir/test/mlir-tblgen/constraint-unique.td | 10
-rw-r--r--  mlir/test/mlir-tblgen/op-attribute.td | 16
-rw-r--r--  mlir/test/mlir-tblgen/op-properties-predicates.td | 2
-rw-r--r--  mlir/test/mlir-tblgen/predicate.td | 16
-rw-r--r--  mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp | 38
-rw-r--r--  mlir/unittests/TableGen/CMakeLists.txt | 2
-rw-r--r--  offload/include/Shared/Environment.h | 22
-rw-r--r--  offload/plugins-nextgen/amdgpu/src/rtl.cpp | 14
-rw-r--r--  offload/plugins-nextgen/common/include/PluginInterface.h | 24
-rw-r--r--  offload/plugins-nextgen/common/src/PluginInterface.cpp | 102
-rw-r--r--  offload/plugins-nextgen/cuda/src/rtl.cpp | 6
-rw-r--r--  offload/plugins-nextgen/host/src/rtl.cpp | 8
-rw-r--r--  offload/test/libc/malloc_parallel.c (renamed from offload/test/offloading/malloc_parallel.c) | 0
-rw-r--r--  offload/test/mapping/lambda_mapping.cpp | 2
-rw-r--r--  offload/test/offloading/interop-print.c | 1
-rw-r--r--  offload/test/offloading/malloc.c | 2
-rw-r--r--  openmp/device/include/Allocator.h | 6
-rw-r--r--  openmp/device/src/Allocator.cpp | 67
-rw-r--r--  openmp/device/src/Kernel.cpp | 1
-rw-r--r--  openmp/device/src/Misc.cpp | 4
-rw-r--r--  openmp/device/src/State.cpp | 24
-rw-r--r--  openmp/docs/design/Runtimes.rst | 1
-rw-r--r--  utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 41
419 files changed, 25171 insertions, 18348 deletions
diff --git a/.ci/generate_test_report_github.py b/.ci/generate_test_report_github.py
index 08387de..18c5e07 100644
--- a/.ci/generate_test_report_github.py
+++ b/.ci/generate_test_report_github.py
@@ -4,21 +4,10 @@
"""Script to generate a build report for Github."""
import argparse
-import platform
import generate_test_report_lib
-def compute_platform_title() -> str:
- logo = ":window:" if platform.system() == "Windows" else ":penguin:"
- # On Linux the machine value is x86_64 on Windows it is AMD64.
- if platform.machine() == "x86_64" or platform.machine() == "AMD64":
- arch = "x64"
- else:
- arch = platform.machine()
- return f"{logo} {platform.system()} {arch} Test Results"
-
-
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("return_code", help="The build's return code.", type=int)
@@ -28,7 +17,9 @@ if __name__ == "__main__":
args = parser.parse_args()
report = generate_test_report_lib.generate_report_from_files(
- compute_platform_title(), args.return_code, args.build_test_logs
+ generate_test_report_lib.compute_platform_title(),
+ args.return_code,
+ args.build_test_logs,
)
print(report)
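
For readers unfamiliar with the helper being moved, here is a minimal sketch (not part of the patch) of the titles compute_platform_title() produces once it lives in generate_test_report_lib; the mocked platform values mirror the comment in its body (Linux reports x86_64, Windows reports AMD64):

```python
# Sketch only: exercises compute_platform_title() under mocked platforms.
# Assumes generate_test_report_lib (and its junitparser dependency) is
# importable; the expected strings follow the f-string in the function.
from unittest import mock

import generate_test_report_lib

with mock.patch("platform.system", return_value="Linux"), mock.patch(
    "platform.machine", return_value="x86_64"
):
    assert (
        generate_test_report_lib.compute_platform_title()
        == ":penguin: Linux x64 Test Results"
    )

with mock.patch("platform.system", return_value="Windows"), mock.patch(
    "platform.machine", return_value="AMD64"
):
    assert (
        generate_test_report_lib.compute_platform_title()
        == ":window: Windows x64 Test Results"
    )
```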
diff --git a/.ci/generate_test_report_lib.py b/.ci/generate_test_report_lib.py
index 0c025c5..ce8262f 100644
--- a/.ci/generate_test_report_lib.py
+++ b/.ci/generate_test_report_lib.py
@@ -3,8 +3,22 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Library to parse JUnit XML files and return a markdown report."""
+from typing import TypedDict, Optional
+import platform
+
from junitparser import JUnitXml, Failure
+
+# This data structure should match the definition in llvm-zorg in
+# premerge/advisor/advisor_lib.py
+# TODO(boomanaiden154): Drop the Optional here and switch to str | None when
+# we require Python 3.10.
+class FailureExplanation(TypedDict):
+ name: str
+ explained: bool
+ reason: Optional[str]
+
+
SEE_BUILD_FILE_STR = "Download the build's log file to see the details."
UNRELATED_FAILURES_STR = (
"If these failures are unrelated to your changes (for example "
@@ -82,16 +96,29 @@ def find_failure_in_ninja_logs(ninja_logs: list[list[str]]) -> list[tuple[str, s
return failures
-def _format_ninja_failures(ninja_failures: list[tuple[str, str]]) -> list[str]:
- """Formats ninja failures into summary views for the report."""
+def _format_failures(
+ failures: list[tuple[str, str]], failure_explanations: dict[str, FailureExplanation]
+) -> list[str]:
+ """Formats failures into summary views for the report."""
output = []
- for build_failure in ninja_failures:
+ for build_failure in failures:
failed_action, failure_message = build_failure
+ failure_explanation = None
+ if failed_action in failure_explanations:
+ failure_explanation = failure_explanations[failed_action]
+ output.append("<details>")
+ if failure_explanation:
+ output.extend(
+ [
+ f"<summary>{failed_action} (Likely Already Failing)</summary>",
+ failure_explanation["reason"],
+ "",
+ ]
+ )
+ else:
+ output.extend([f"<summary>{failed_action}</summary>", ""])
output.extend(
[
- "<details>",
- f"<summary>{failed_action}</summary>",
- "",
"```",
failure_message,
"```",
@@ -132,12 +159,19 @@ def generate_report(
ninja_logs: list[list[str]],
size_limit=1024 * 1024,
list_failures=True,
+ failure_explanations_list: list[FailureExplanation] = [],
):
failures = get_failures(junit_objects)
tests_run = 0
tests_skipped = 0
tests_failed = 0
+ failure_explanations: dict[str, FailureExplanation] = {}
+ for failure_explanation in failure_explanations_list:
+ if not failure_explanation["explained"]:
+ continue
+ failure_explanations[failure_explanation["name"]] = failure_explanation
+
for results in junit_objects:
for testsuite in results:
tests_run += testsuite.tests
@@ -176,7 +210,7 @@ def generate_report(
"",
]
)
- report.extend(_format_ninja_failures(ninja_failures))
+ report.extend(_format_failures(ninja_failures, failure_explanations))
report.extend(
[
"",
@@ -212,18 +246,7 @@ def generate_report(
for testsuite_name, failures in failures.items():
report.extend(["", f"### {testsuite_name}"])
- for name, output in failures:
- report.extend(
- [
- "<details>",
- f"<summary>{name}</summary>",
- "",
- "```",
- output,
- "```",
- "</details>",
- ]
- )
+ report.extend(_format_failures(failures, failure_explanations))
elif return_code != 0:
# No tests failed but the build was in a failed state. Bring this to the user's
# attention.
@@ -248,7 +271,7 @@ def generate_report(
"",
]
)
- report.extend(_format_ninja_failures(ninja_failures))
+ report.extend(_format_failures(ninja_failures, failure_explanations))
if failures or return_code != 0:
report.extend(["", UNRELATED_FAILURES_STR])
@@ -285,3 +308,13 @@ def load_info_from_files(build_log_files):
def generate_report_from_files(title, return_code, build_log_files):
junit_objects, ninja_logs = load_info_from_files(build_log_files)
return generate_report(title, return_code, junit_objects, ninja_logs)
+
+
+def compute_platform_title() -> str:
+ logo = ":window:" if platform.system() == "Windows" else ":penguin:"
+ # On Linux the machine value is x86_64; on Windows it is AMD64.
+ if platform.machine() == "x86_64" or platform.machine() == "AMD64":
+ arch = "x64"
+ else:
+ arch = platform.machine()
+ return f"{logo} {platform.system()} {arch} Test Results"
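
The tests added below exercise the new parameter end to end; as a compact illustration, here is a hedged sketch of how a caller might feed advisor output into generate_report (the title is a placeholder, and the ninja log is modeled on the test data):

```python
# Sketch only: one explained and one unexplained failure. Entries with
# "explained": False are filtered out by generate_report, so only
# test/4.stamp gets the "(Likely Already Failing)" summary.
import generate_test_report_lib

ninja_log = [
    "[1/5] test/1.stamp",
    "[2/5] test/2.stamp",
    "[3/5] test/3.stamp",
    "[4/5] test/4.stamp",
    "FAILED: test/4.stamp",
    "touch test/4.stamp",
    "[5/5] test/5.stamp",
]

explanations = [
    {"name": "test/4.stamp", "explained": True, "reason": "Failing at head"},
    {"name": "test/2.stamp", "explained": False, "reason": None},
]

report = generate_test_report_lib.generate_report(
    "Demo Title",  # placeholder title
    1,  # non-zero build return code
    [],  # no JUnit XML objects in this sketch
    [ninja_log],
    failure_explanations_list=explanations,
)
print(report)
```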
diff --git a/.ci/generate_test_report_lib_test.py b/.ci/generate_test_report_lib_test.py
index 4068a3b7..341cf30 100644
--- a/.ci/generate_test_report_lib_test.py
+++ b/.ci/generate_test_report_lib_test.py
@@ -781,6 +781,160 @@ class TestReports(unittest.TestCase):
),
)
+ def test_report_ninja_explanation(self):
+ self.assertEqual(
+ generate_test_report_lib.generate_report(
+ "Foo",
+ 1,
+ [],
+ [
+ [
+ "[1/5] test/1.stamp",
+ "[2/5] test/2.stamp",
+ "[3/5] test/3.stamp",
+ "[4/5] test/4.stamp",
+ "FAILED: test/4.stamp",
+ "touch test/4.stamp",
+ "Half Moon Bay.",
+ "[5/5] test/5.stamp",
+ ]
+ ],
+ failure_explanations_list=[
+ {
+ "name": "test/4.stamp",
+ "explained": True,
+ "reason": "Failing at head",
+ }
+ ],
+ ),
+ dedent(
+ """\
+ # Foo
+
+ The build failed before running any tests. Click on a failure below to see the details.
+
+ <details>
+ <summary>test/4.stamp (Likely Already Failing)</summary>
+ Failing at head
+
+ ```
+ FAILED: test/4.stamp
+ touch test/4.stamp
+ Half Moon Bay.
+ ```
+ </details>
+
+ If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the `infrastructure` label."""
+ ),
+ )
+
+ def test_report_test_failure_explanation(self):
+ self.assertEqual(
+ generate_test_report_lib.generate_report(
+ "Foo",
+ 1,
+ [
+ junit_from_xml(
+ dedent(
+ """\
+ <?xml version="1.0" encoding="UTF-8"?>
+ <testsuites time="8.89">
+ <testsuite name="Bar" tests="1" failures="1" skipped="0" time="410.63">
+ <testcase classname="Bar/test_3" name="test_3" time="0.02">
+ <failure><![CDATA[Error! Expected Big Sur to be next to the ocean.]]></failure>
+ </testcase>
+ </testsuite>
+ </testsuites>"""
+ )
+ )
+ ],
+ [],
+ failure_explanations_list=[
+ {
+ "name": "Bar/test_3/test_3",
+ "explained": True,
+ "reason": "Big Sur is next to the Pacific.",
+ }
+ ],
+ ),
+ (
+ dedent(
+ """\
+ # Foo
+
+ * 1 test failed
+
+ ## Failed Tests
+ (click on a test name to see its output)
+
+ ### Bar
+ <details>
+ <summary>Bar/test_3/test_3 (Likely Already Failing)</summary>
+ Big Sur is next to the Pacific.
+
+ ```
+ Error! Expected Big Sur to be next to the ocean.
+ ```
+ </details>
+
+ If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the `infrastructure` label."""
+ )
+ ),
+ )
+
+ def test_report_test_failure_have_explanation_explained_false(self):
+ self.assertEqual(
+ generate_test_report_lib.generate_report(
+ "Foo",
+ 1,
+ [
+ junit_from_xml(
+ dedent(
+ """\
+ <?xml version="1.0" encoding="UTF-8"?>
+ <testsuites time="8.89">
+ <testsuite name="Bar" tests="1" failures="1" skipped="0" time="410.63">
+ <testcase classname="Bar/test_3" name="test_3" time="0.02">
+ <failure><![CDATA[Error! Expected Mt. Shasta to be next in the Eastern Sierras.]]></failure>
+ </testcase>
+ </testsuite>
+ </testsuites>"""
+ )
+ )
+ ],
+ [],
+ failure_explanations_list=[
+ {
+ "name": "Bar/test_3/test_3",
+ "explained": False,
+ "reason": "Mt. Shasta is in the Cascades",
+ }
+ ],
+ ),
+ (
+ dedent(
+ """\
+ # Foo
+
+ * 1 test failed
+
+ ## Failed Tests
+ (click on a test name to see its output)
+
+ ### Bar
+ <details>
+ <summary>Bar/test_3/test_3</summary>
+
+ ```
+ Error! Expected Mt. Shasta to be next in the Eastern Sierras.
+ ```
+ </details>
+
+ If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the `infrastructure` label."""
+ )
+ ),
+ )
+
def test_generate_report_end_to_end(self):
with tempfile.TemporaryDirectory() as temp_dir:
junit_xml_file = os.path.join(temp_dir, "junit.xml")
diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index 5fb8f69..beaed71 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -32,8 +32,6 @@ export LD=link
# see https://github.com/llvm/llvm-project/pull/82393 and
# https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840/40
# for further information.
-# We limit the number of parallel compile jobs to 24 control memory
-# consumption and improve build reliability.
cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
-D LLVM_ENABLE_PROJECTS="${projects}" \
-G Ninja \
@@ -49,7 +47,6 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
-D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \
-D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \
-D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" \
- -D CMAKE_CXX_FLAGS="-Wno-c++98-compat -Wno-c++14-compat -Wno-unsafe-buffer-usage -Wno-old-style-cast" \
-D LLVM_ENABLE_RUNTIMES="${runtimes}"
start-group "ninja"
diff --git a/.github/renovate.json b/.github/renovate.json
index 6ce98c4e..8e89ba8 100644
--- a/.github/renovate.json
+++ b/.github/renovate.json
@@ -8,5 +8,12 @@
"minimumReleaseAge": "3 days",
"assignees": ["boomanaiden154"],
"ignorePaths": [".github/workflows/containers/**"],
- "groupName": "[Github] Update GHA Dependencies"
+ "groupName": "[Github] Update GHA Dependencies",
+ "packageRules": [
+ {
+ "matchPackageNames": ["windows", "macos"],
+ "matchManagers": ["github-actions"],
+ "enabled": false
+ }
+ ]
}
diff --git a/.github/workflows/build-ci-container-tooling.yml b/.github/workflows/build-ci-container-tooling.yml
index 992947e..46dc38f 100644
--- a/.github/workflows/build-ci-container-tooling.yml
+++ b/.github/workflows/build-ci-container-tooling.yml
@@ -12,17 +12,30 @@ on:
- '.github/workflows/containers/github-action-ci-tooling/**'
- llvm/utils/git/requirements_formatting.txt
- llvm/utils/git/requirements_linting.txt
+ - '.github/workflows/build-container/**'
+ - '.github/workflows/push-container/**'
pull_request:
paths:
- .github/workflows/build-ci-container-tooling.yml
- '.github/workflows/containers/github-action-ci-tooling/**'
- llvm/utils/git/requirements_formatting.txt
- llvm/utils/git/requirements_linting.txt
+ - '.github/workflows/build-container/**'
+ - '.github/workflows/push-container/**'
jobs:
build-ci-container-tooling:
+ name: Build Container ${{ matrix.container-name }}
if: github.repository_owner == 'llvm'
runs-on: ubuntu-24.04
+ strategy:
+ fail-fast: false
+ matrix:
+ include:
+ - container-name: code-format
+ test-command: 'cd $HOME && clang-format --version | grep version && git-clang-format -h | grep usage && black --version | grep black'
+ - container-name: code-lint
+ test-command: 'cd $HOME && clang-tidy --version | grep version && clang-tidy-diff.py -h | grep usage'
steps:
- name: Checkout LLVM
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
@@ -32,48 +45,15 @@ jobs:
llvm/utils/git/requirements_formatting.txt
llvm/utils/git/requirements_linting.txt
clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py
+ .github/workflows/build-container
- - name: Write Variables
- id: vars
- run: |
- tag=$(git rev-parse --short=12 HEAD)
- container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/amd64/ci-ubuntu-24.04"
- echo "container-name-format=$container_name-code-format" >> $GITHUB_OUTPUT
- echo "container-name-lint=$container_name-code-lint" >> $GITHUB_OUTPUT
- echo "container-name-format-tag=$container_name-format:$tag" >> $GITHUB_OUTPUT
- echo "container-name-lint-tag=$container_name-lint:$tag" >> $GITHUB_OUTPUT
- echo "container-format-filename=$(echo $container_name-format:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT
- echo "container-lint-filename=$(echo $container_name-lint:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT
-
- - name: Build container
- run: |
- podman build --target ci-container-code-format \
- -f .github/workflows/containers/github-action-ci-tooling/Dockerfile \
- -t ${{ steps.vars.outputs.container-name-format-tag }} .
- podman build --target ci-container-code-lint \
- -f .github/workflows/containers/github-action-ci-tooling/Dockerfile \
- -t ${{ steps.vars.outputs.container-name-lint-tag }} .
-
- # Save the container so we have it in case the push fails. This also
- # allows us to separate the push step into a different job so we can
- # maintain minimal permissions while building the container.
- - name: Save container image
- run: |
- podman save ${{ steps.vars.outputs.container-name-format-tag }} > ${{ steps.vars.outputs.container-format-filename }}
- podman save ${{ steps.vars.outputs.container-name-lint-tag }} > ${{ steps.vars.outputs.container-lint-filename }}
-
- - name: Upload container image
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ - name: Build Container
+ uses: ./.github/workflows/build-container
with:
- name: container-amd64
- path: "*.tar"
- retention-days: 14
-
- - name: Test Container
- run: |
- # Use --pull=never to ensure we are testing the just built image.
- podman run --pull=never --rm -it ${{ steps.vars.outputs.container-name-format-tag }} /usr/bin/bash -x -c 'cd $HOME && clang-format --version | grep version && git-clang-format -h | grep usage && black --version | grep black'
- podman run --pull=never --rm -it ${{ steps.vars.outputs.container-name-lint-tag }} /usr/bin/bash -x -c 'cd $HOME && clang-tidy --version | grep version && clang-tidy-diff.py -h | grep usage'
+ container-name: ci-ubuntu-24.04-${{ matrix.container-name }}
+ dockerfile: .github/workflows/containers/github-action-ci-tooling/Dockerfile
+ target: ci-container-${{ matrix.container-name }}
+ test-command: ${{ matrix.test-command }}
push-ci-container:
if: github.event_name == 'push'
@@ -82,34 +62,13 @@ jobs:
permissions:
packages: write
runs-on: ubuntu-24.04
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- - name: Download container
- uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
-
- - name: Push Container
- run: |
- function push_container {
- image_name=$1
- latest_name=$(echo $image_name | sed 's/:[a-f0-9]\+$/:latest/g')
- podman tag $image_name $latest_name
- echo "Pushing $image_name ..."
- podman push $image_name
- echo "Pushing $latest_name ..."
- podman push $latest_name
- }
-
- podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io
- for f in $(find . -iname '*.tar'); do
- image_name=$(podman load -q -i $f | sed 's/Loaded image: //g')
- push_container $image_name
+ - name: Checkout LLVM
+ uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+ with:
+ sparse-checkout: |
+ .github/workflows/push-container
- if echo $image_name | grep '/amd64/'; then
- # For amd64, create an alias with the arch component removed.
- # This matches the convention used on dockerhub.
- default_image_name=$(echo $(dirname $(dirname $image_name))/$(basename $image_name))
- podman tag $image_name $default_image_name
- push_container $default_image_name
- fi
- done
+ - uses: ./.github/workflows/push-container
+ with:
+ token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml
index 14c349b..b6c46b70 100644
--- a/.github/workflows/build-ci-container-windows.yml
+++ b/.github/workflows/build-ci-container-windows.yml
@@ -44,7 +44,7 @@ jobs:
run: |
docker save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }}
- name: Upload container image
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: container
path: ${{ steps.vars.outputs.container-filename }}
@@ -61,7 +61,7 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- name: Download container
- uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
+ uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
with:
name: container
- name: Push Container
diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml
index 027c558..33b4dda 100644
--- a/.github/workflows/build-ci-container.yml
+++ b/.github/workflows/build-ci-container.yml
@@ -10,72 +10,46 @@ on:
paths:
- .github/workflows/build-ci-container.yml
- '.github/workflows/containers/github-action-ci/**'
+ - '.github/workflows/build-container/**'
+ - '.github/workflows/push-container/**'
pull_request:
paths:
- .github/workflows/build-ci-container.yml
- '.github/workflows/containers/github-action-ci/**'
+ - '.github/workflows/build-container/**'
+ - '.github/workflows/push-container/**'
jobs:
build-ci-container:
+ name: Build Container ${{ matrix.container-name }} ${{ (contains(matrix.runs-on, 'arm') && 'ARM64') || 'X64' }}
if: github.repository_owner == 'llvm'
runs-on: ${{ matrix.runs-on }}
strategy:
matrix:
- include:
- # The arch names should match the names used on dockerhub.
- # See https://github.com/docker-library/official-images#architectures-other-than-amd64
- - arch: amd64
- runs-on: depot-ubuntu-24.04-16
- - arch: arm64v8
- runs-on: depot-ubuntu-24.04-arm-16
+ runs-on:
+ - depot-ubuntu-24.04-16
+ - depot-ubuntu-24.04-arm-16
+ container-name:
+ - ''
+ - agent
+ test-command:
+ - cd $HOME && printf '#include <iostream>\nint main(int argc, char **argv) { std::cout << "Hello\\n"; }' | clang++ -x c++ - && ./a.out | grep Hello
steps:
- name: Checkout LLVM
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
- sparse-checkout: .github/workflows/containers/github-action-ci/
- # podman is not installed by default on the ARM64 images.
- - name: Install Podman
- if: runner.arch == 'ARM64'
- run: |
- sudo apt-get install podman
- - name: Write Variables
- id: vars
- run: |
- tag=$(git rev-parse --short=12 HEAD)
- container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/${{ matrix.arch }}/ci-ubuntu-24.04"
- echo "container-name=$container_name" >> $GITHUB_OUTPUT
- echo "container-name-agent=$container_name-agent" >> $GITHUB_OUTPUT
- echo "container-name-tag=$container_name:$tag" >> $GITHUB_OUTPUT
- echo "container-name-agent-tag=$container_name-agent:$tag" >> $GITHUB_OUTPUT
- echo "container-filename=$(echo $container_name:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT
- echo "container-agent-filename=$(echo $container_name-agent:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT
- - name: Build container
- working-directory: ./.github/workflows/containers/github-action-ci/
- run: |
- podman build --target ci-container -t ${{ steps.vars.outputs.container-name-tag }} .
- podman build --target ci-container-agent -t ${{ steps.vars.outputs.container-name-agent-tag }} .
+ sparse-checkout: |
+ .github/workflows/containers/github-action-ci/
+ .github/workflows/build-container
- # Save the container so we have it in case the push fails. This also
- # allows us to separate the push step into a different job so we can
- # maintain minimal permissions while building the container.
- - name: Save container image
- run: |
- podman save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }}
- podman save ${{ steps.vars.outputs.container-name-agent-tag }} > ${{ steps.vars.outputs.container-agent-filename }}
-
- - name: Upload container image
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ - name: Build Container
+ uses: ./.github/workflows/build-container
with:
- name: container-${{ matrix.arch }}
- path: "*.tar"
- retention-days: 14
-
- - name: Test Container
- run: |
- for image in ${{ steps.vars.outputs.container-name-tag }}; do
- # Use --pull=never to ensure we are testing the just built image.
- podman run --pull=never --rm -it $image /usr/bin/bash -x -c 'cd $HOME && printf '\''#include <iostream>\nint main(int argc, char **argv) { std::cout << "Hello\\n"; }'\'' | clang++ -x c++ - && ./a.out | grep Hello'
- done
+ container-name: ci-ubuntu-24.04${{ matrix.container-name && format('-{0}', matrix.container-name)}}
+ context: .github/workflows/containers/github-action-ci/
+ dockerfile: .github/workflows/containers/github-action-ci/Dockerfile
+ target: ci-container${{ matrix.container-name && format('-{0}', matrix.container-name) }}
+ test-command: ${{ matrix.test-command }}
push-ci-container:
if: github.event_name == 'push'
@@ -87,31 +61,12 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- - name: Download container
- uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
-
- - name: Push Container
- run: |
- function push_container {
- image_name=$1
- latest_name=$(echo $image_name | sed 's/:[a-f0-9]\+$/:latest/g')
- podman tag $image_name $latest_name
- echo "Pushing $image_name ..."
- podman push $image_name
- echo "Pushing $latest_name ..."
- podman push $latest_name
- }
-
- podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io
- for f in $(find . -iname '*.tar'); do
- image_name=$(podman load -q -i $f | sed 's/Loaded image: //g')
- push_container $image_name
+ - name: Checkout LLVM
+ uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+ with:
+ sparse-checkout: |
+ .github/workflows/push-container
- if echo $image_name | grep '/amd64/'; then
- # For amd64, create an alias with the arch component removed.
- # This matches the convention used on dockerhub.
- default_image_name=$(echo $(dirname $(dirname $image_name))/$(basename $image_name))
- podman tag $image_name $default_image_name
- push_container $default_image_name
- fi
- done
+ - uses: ./.github/workflows/push-container
+ with:
+ token: ${{ secrets.GITHUB_TOKEN }}
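
The shared .github/workflows/push-container action that replaces the inline script is not part of this diff; as a hedged reference for the refactor, here is a Python re-expression of the name rewriting the deleted script performed (the :latest retag and the arch-less alias for amd64 images):

```python
# Sketch only: mirrors the sed/dirname logic removed above; the actual
# push-container action is defined outside this diff.
import posixpath
import re


def latest_name(image_name: str) -> str:
    # sed 's/:[a-f0-9]\+$/:latest/g': swap the short-sha tag for :latest.
    return re.sub(r":[a-f0-9]+$", ":latest", image_name)


def default_image_name(image_name: str) -> str:
    # $(dirname $(dirname $image))/$(basename $image): drop the arch
    # component so amd64 images match the convention used on dockerhub.
    return posixpath.join(
        posixpath.dirname(posixpath.dirname(image_name)),
        posixpath.basename(image_name),
    )


name = "ghcr.io/llvm/amd64/ci-ubuntu-24.04:0123456789ab"
assert latest_name(name) == "ghcr.io/llvm/amd64/ci-ubuntu-24.04:latest"
assert default_image_name(name) == "ghcr.io/llvm/ci-ubuntu-24.04:0123456789ab"
```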
diff --git a/.github/workflows/build-container/action.yml b/.github/workflows/build-container/action.yml
new file mode 100644
index 0000000..595c3f8
--- /dev/null
+++ b/.github/workflows/build-container/action.yml
@@ -0,0 +1,95 @@
+name: Build Container
+description: >-
+  Build and test a container using the standard LLVM container naming scheme.
+
+inputs:
+ tag:
+ description: >-
+ The tag to use for this container.
+ required: false
+ container-name:
+ description: >-
+ The name for the container.
+ required: true
+ dockerfile:
+ description: >-
+      Path to the Dockerfile.
+ required: false
+ target:
+ description: >-
+      The container target to build (passed to podman via the --target option).
+ required: false
+ context:
+ description: >-
+ Path to context for the container build.
+ required: false
+ test-command:
+ description: >-
+ Test command to run to ensure the container is working correctly.
+ required: false
+
+runs:
+ using: "composite"
+ steps:
+ # podman is not installed by default on the ARM64 images.
+ - name: Install Podman
+ if: runner.arch == 'ARM64'
+ shell: bash
+ run: |
+ sudo apt-get install podman
+
+ - name: Build Container
+ shell: bash
+ env:
+      INPUT_TAG: ${{ inputs.tag }}
+ INPUT_CONTAINER_NAME: ${{ inputs.container-name }}
+ INPUT_TARGET: ${{ inputs.target }}
+ INPUT_DOCKERFILE: ${{ inputs.dockerfile }}
+ INPUT_CONTEXT: ${{ inputs.context }}
+ id: build
+ run: |
+ env
+ tag="${INPUT_TAG:-$(git rev-parse --short=12 HEAD)}"
+
+ case "$RUNNER_ARCH" in
+ ARM64)
+ container_arch="arm64v8"
+ ;;
+ *)
+ container_arch="amd64"
+ ;;
+ esac
+
+ container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/$container_arch/$INPUT_CONTAINER_NAME:$tag"
+ container_filename="$(echo $container_name | sed -e 's/\//-/g' -e 's/:/-/g').tar"
+ if [ -n "$INPUT_TARGET" ]; then
+ podman_options="$podman_options --target $INPUT_TARGET"
+ fi
+ if [ -n "$INPUT_DOCKERFILE" ]; then
+ podman_options="$podman_options -f $INPUT_DOCKERFILE"
+ fi
+ podman_options="$podman_options ${INPUT_CONTEXT:-.}"
+ echo "Podman Options: $podman_options"
+
+ podman build -t $container_name $podman_options
+
+ podman save $container_name > $container_filename
+
+ echo "container-full-name=$container_name" >> $GITHUB_OUTPUT
+
+ - name: Create container artifact
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
+ with:
+ name: ${{ inputs.container-name }}-${{ runner.arch }}
+ path: "*.tar"
+ retention-days: 14
+
+ - name: Test container
+ shell: bash
+ if: inputs.test-command
+ env:
+ INPUT_TEST_COMMAND: ${{ inputs.test-command }}
+ CONTAINER_FULL_NAME: ${{ steps.build.outputs.container-full-name }}
+ run: |
+ podman run --pull=never --rm -it $CONTAINER_FULL_NAME /usr/bin/bash -x -c "$INPUT_TEST_COMMAND"
+
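To summarize the naming scheme the composite action implements, here is a small Python sketch (values illustrative; the arch names match the ones used on dockerhub, per the comment removed from build-ci-container.yml above):

```python
# Sketch only: reproduces the names computed by the "Build Container"
# step above for a given owner, runner arch, container name, and tag.
def container_full_name(
    owner: str, runner_arch: str, container_name: str, tag: str
) -> str:
    arch = "arm64v8" if runner_arch == "ARM64" else "amd64"
    return f"ghcr.io/{owner}/{arch}/{container_name}:{tag}"


def container_filename(full_name: str) -> str:
    # sed -e 's/\//-/g' -e 's/:/-/g', plus the .tar suffix.
    return full_name.replace("/", "-").replace(":", "-") + ".tar"


name = container_full_name("llvm", "ARM64", "ci-ubuntu-24.04", "0123456789ab")
assert name == "ghcr.io/llvm/arm64v8/ci-ubuntu-24.04:0123456789ab"
assert container_filename(name) == (
    "ghcr.io-llvm-arm64v8-ci-ubuntu-24.04-0123456789ab.tar"
)
```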
diff --git a/.github/workflows/build-metrics-container.yml b/.github/workflows/build-metrics-container.yml
index 69b5715..786c412 100644
--- a/.github/workflows/build-metrics-container.yml
+++ b/.github/workflows/build-metrics-container.yml
@@ -49,7 +49,7 @@ jobs:
run: |
podman save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }}
- name: Upload Container Image
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: container
path: ${{ steps.vars.outputs.container-filename }}
@@ -66,7 +66,7 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- name: Download Container
- uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
+ uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
with:
name: container
- name: Push Container
diff --git a/.github/workflows/ci-post-commit-analyzer.yml b/.github/workflows/ci-post-commit-analyzer.yml
index 49cf410..59df0b6 100644
--- a/.github/workflows/ci-post-commit-analyzer.yml
+++ b/.github/workflows/ci-post-commit-analyzer.yml
@@ -87,7 +87,7 @@ jobs:
scan-build --generate-index-only build/analyzer-results
- name: Upload Results
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
if: always()
with:
name: analyzer-results
diff --git a/.github/workflows/commit-access-review.yml b/.github/workflows/commit-access-review.yml
index 734dc21..7cdcfca 100644
--- a/.github/workflows/commit-access-review.yml
+++ b/.github/workflows/commit-access-review.yml
@@ -28,7 +28,7 @@ jobs:
python3 .github/workflows/commit-access-review.py $GITHUB_TOKEN
- name: Upload Triage List
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: triagers
path: triagers.log
diff --git a/.github/workflows/containers/github-action-ci-tooling/Dockerfile b/.github/workflows/containers/github-action-ci-tooling/Dockerfile
index 8aaa2e8..707bdb3 100644
--- a/.github/workflows/containers/github-action-ci-tooling/Dockerfile
+++ b/.github/workflows/containers/github-action-ci-tooling/Dockerfile
@@ -22,6 +22,7 @@ RUN apt-get update && \
FROM docker.io/library/ubuntu:24.04 AS base
ENV LLVM_SYSROOT=/opt/llvm
+ENV PATH=${LLVM_SYSROOT}/bin:${PATH}
# Need nodejs for some of the GitHub actions.
# Need git for git-clang-format.
@@ -53,7 +54,6 @@ COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/cla
/llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/git-clang-format \
${LLVM_SYSROOT}/bin/
-ENV PATH=${LLVM_SYSROOT}/bin:${PATH}
# Install dependencies for 'pr-code-format.yml' job
COPY llvm/utils/git/requirements_formatting.txt requirements_formatting.txt
@@ -77,7 +77,6 @@ COPY clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py ${LLVM_SYSROOT}/bin/cl
RUN ln -s ${LLVM_SYSROOT}/bin/clang-${LLVM_VERSION_MAJOR} ${LLVM_SYSROOT}/bin/clang && \
ln -s ${LLVM_SYSROOT}/bin/clang ${LLVM_SYSROOT}/bin/clang++
-ENV PATH=${LLVM_SYSROOT}/bin:${PATH}
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y \
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 7374777c..2f9354a 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -209,7 +209,7 @@ jobs:
mkdir built-docs/flang
cp -r flang-build/docs/* built-docs/flang/
- name: Upload docs
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: docs-output
path: built-docs/
diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml
index 981c6fa..ba625b2 100644
--- a/.github/workflows/email-check.yaml
+++ b/.github/workflows/email-check.yaml
@@ -39,7 +39,7 @@ jobs:
[{"body" : "$COMMENT"}]
EOF
- - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
if: always()
with:
name: workflow-args
diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index 432c457..6377dd5 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -131,7 +131,7 @@ jobs:
sed -i 's/LLVM_[0-9]\+/LLVM_NOVERSION/' $lib-${{ matrix.ref }}.abi
done
- name: Upload ABI file
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0
with:
name: ${{ matrix.name }}
path: '*${{ matrix.ref }}.abi'
@@ -144,12 +144,12 @@ jobs:
- abi-dump
steps:
- name: Download baseline
- uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
+ uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
with:
name: build-baseline
path: build-baseline
- name: Download latest
- uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
+ uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
with:
name: build-latest
path: build-latest
@@ -165,7 +165,7 @@ jobs:
done
- name: Upload ABI Comparison
if: always()
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0
with:
name: compat-report-${{ github.sha }}
path: compat_reports/
diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index 6c8f2cb..461b723 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -60,7 +60,7 @@ jobs:
env:
CC: ${{ matrix.cc }}
CXX: ${{ matrix.cxx }}
- - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
if: always()
with:
name: ${{ matrix.config }}-${{ matrix.cxx }}-results
@@ -105,7 +105,7 @@ jobs:
env:
CC: ${{ matrix.cc }}
CXX: ${{ matrix.cxx }}
- - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
if: always() # Upload artifacts even if the build or test suite fails
with:
name: ${{ matrix.config }}-${{ matrix.cxx }}-results
@@ -169,7 +169,7 @@ jobs:
env:
CC: clang-22
CXX: clang++-22
- - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
if: always()
with:
name: ${{ matrix.config }}-results
@@ -223,7 +223,7 @@ jobs:
source .venv/bin/activate
python -m pip install psutil
bash libcxx/utils/ci/run-buildbot ${{ matrix.config }}
- - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
if: always() # Upload artifacts even if the build or test suite fails
with:
name: macos-${{ matrix.config }}-results
diff --git a/.github/workflows/llvm-abi-tests.yml b/.github/workflows/llvm-abi-tests.yml
index 961f1cc..b0c2d32 100644
--- a/.github/workflows/llvm-abi-tests.yml
+++ b/.github/workflows/llvm-abi-tests.yml
@@ -128,14 +128,14 @@ jobs:
# Remove symbol versioning from dumps, so we can compare across major versions.
sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' ${{ matrix.ref }}.abi
- name: Upload ABI file
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0
with:
name: ${{ matrix.name }}
path: ${{ matrix.ref }}.abi
- name: Upload symbol list file
if: matrix.name == 'build-baseline'
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0
with:
name: symbol-list
path: llvm.symbols
@@ -148,17 +148,17 @@ jobs:
- abi-dump
steps:
- name: Download baseline
- uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
+ uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
with:
name: build-baseline
path: build-baseline
- name: Download latest
- uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
+ uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
with:
name: build-latest
path: build-latest
- name: Download symbol list
- uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
+ uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
with:
name: symbol-list
path: symbol-list
@@ -179,7 +179,7 @@ jobs:
abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c"
- name: Upload ABI Comparison
if: always()
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0
with:
name: compat-report-${{ github.sha }}
path: compat_reports/
diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml
index ac0689b..029325c5 100644
--- a/.github/workflows/pr-code-format.yml
+++ b/.github/workflows/pr-code-format.yml
@@ -56,7 +56,7 @@ jobs:
--end-rev HEAD \
--changed-files "$CHANGED_FILES"
- - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
if: always()
with:
name: workflow-args
diff --git a/.github/workflows/pr-code-lint.yml b/.github/workflows/pr-code-lint.yml
index 8ba93787..3cb564f 100644
--- a/.github/workflows/pr-code-lint.yml
+++ b/.github/workflows/pr-code-lint.yml
@@ -91,7 +91,7 @@ jobs:
--changed-files "$CHANGED_FILES"
- name: Upload results
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
if: always()
with:
name: workflow-args
diff --git a/.github/workflows/pr-request-release-note.yml b/.github/workflows/pr-request-release-note.yml
index 8162a89..c2dc2de 100644
--- a/.github/workflows/pr-request-release-note.yml
+++ b/.github/workflows/pr-request-release-note.yml
@@ -41,7 +41,7 @@ jobs:
request-release-note \
--pr-number ${{ github.event.pull_request.number}}
- - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
if: always()
with:
name: workflow-args
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 973d3ab..8503b2d 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -110,7 +110,7 @@ jobs:
# https://github.com/actions/upload-artifact/issues/569
continue-on-error: true
if: '!cancelled()'
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: Premerge Artifacts (Linux ${{ runner.arch }})
path: artifacts/
@@ -165,7 +165,7 @@ jobs:
# https://github.com/actions/upload-artifact/issues/569
continue-on-error: true
if: '!cancelled()'
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: Premerge Artifacts (Windows)
path: artifacts/
diff --git a/.github/workflows/push-container/action.yml b/.github/workflows/push-container/action.yml
new file mode 100644
index 0000000..51f4b2a
--- /dev/null
+++ b/.github/workflows/push-container/action.yml
@@ -0,0 +1,44 @@
+name: Push Container
+description: >-
+ Download all container artifacts for this job and push them to the GitHub registry.
+
+inputs:
+ token:
+ description: >-
+ Token to use to authenticate with the container registry.
+ required: true
+
+runs:
+ using: "composite"
+ steps:
+ - name: Download container
+ uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
+
+ - name: Push Container
+ env:
+ GITHUB_TOKEN: ${{ inputs.token }}
+ shell: bash
+ run: |
+ function push_container {
+ image_name=$1
+ latest_name=$(echo $image_name | sed 's/:[a-f0-9]\+$/:latest/g')
+ podman tag $image_name $latest_name
+ echo "Pushing $image_name ..."
+ podman push $image_name
+ echo "Pushing $latest_name ..."
+ podman push $latest_name
+ }
+
+ podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io
+ for f in $(find . -iname '*.tar'); do
+ image_name=$(podman load -q -i $f | sed 's/Loaded image: //g')
+ push_container $image_name
+
+ if echo $image_name | grep '/amd64/'; then
+ # For amd64, create an alias with the arch component removed.
+ # This matches the convention used on dockerhub.
+ default_image_name=$(echo $(dirname $(dirname $image_name))/$(basename $image_name))
+ podman tag $image_name $default_image_name
+ push_container $default_image_name
+ fi
+ done
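The loop above performs two name rewrites: it swaps the trailing digest tag for ":latest", and for amd64 images it drops the arch path component to form the default name. A minimal C++ sketch of those rewrites, assuming names of the form ghcr.io/<org>/<arch>/<image>:<sha> (the sample names below are hypothetical):

#include <cassert>
#include <string>

// Sketch of the action's two renames; not the shell code itself.
static std::string latestName(const std::string &Name) {
  return Name.substr(0, Name.rfind(':')) + ":latest";
}
static std::string defaultName(const std::string &Name) {
  // Drop the "/amd64" path component, matching the dockerhub convention.
  std::string::size_type Pos = Name.find("/amd64/");
  return Pos == std::string::npos ? Name
                                  : Name.substr(0, Pos) + Name.substr(Pos + 6);
}
int main() {
  assert(latestName("ghcr.io/llvm/amd64/ci:abc123") == "ghcr.io/llvm/amd64/ci:latest");
  assert(defaultName("ghcr.io/llvm/amd64/ci:abc123") == "ghcr.io/llvm/ci:abc123");
}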
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 25f426b..9263754 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -225,7 +225,7 @@ jobs:
release_dir=`find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname 'stage2-bins'`
mv $release_dir/${{ needs.prepare.outputs.release-binary-filename }} .
- - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: ${{ runner.os }}-${{ runner.arch }}-release-binary
# Due to path differences on Windows when running in bash vs running on node,
@@ -263,7 +263,7 @@ jobs:
sparse-checkout-cone-mode: false
- name: 'Download artifact'
- uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
+ uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
with:
pattern: '*-release-binary'
merge-multiple: true
@@ -279,7 +279,7 @@ jobs:
mv ${{ steps.provenance.outputs.bundle-path }} ${{ needs.prepare.outputs.release-binary-filename }}.jsonl
- name: Upload Build Provenance
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: ${{ needs.prepare.outputs.release-binary-filename }}-attestation
path: ${{ needs.prepare.outputs.release-binary-filename }}.jsonl
diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml
index 4cf973d..9b77112 100644
--- a/.github/workflows/release-documentation.yml
+++ b/.github/workflows/release-documentation.yml
@@ -63,7 +63,7 @@ jobs:
./llvm/utils/release/build-docs.sh -release "${{ inputs.release-version }}" -no-doxygen
- name: Create Release Notes Artifact
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0
with:
name: release-notes
path: docs-build/html-export/
diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml
index 2278b96..70323c2 100644
--- a/.github/workflows/release-sources.yml
+++ b/.github/workflows/release-sources.yml
@@ -99,7 +99,7 @@ jobs:
run: |
mv ${{ steps.provenance.outputs.bundle-path }} .
- name: Create Tarball Artifacts
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
path: |
*.xz
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
index bd3277a..0845119 100644
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -49,7 +49,7 @@ jobs:
# Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
# format to the repository Actions tab.
- name: "Upload artifact"
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: SARIF file
path: results.sarif
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 7af32c8..b478925 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -1010,14 +1010,12 @@ bool BinaryContext::hasValidCodePadding(const BinaryFunction &BF) {
return Offset - StartOffset;
};
- // Skip a sequence of zero bytes. For AArch64 we only skip 4 bytes of zeros
- // in case the following zeros belong to constant island or veneer.
+ // Skip a sequence of zero bytes. For AArch64 we only skip a whole multiple
+ // of 4 zero bytes, in case the following zeros belong to a veneer.
auto skipZeros = [&]() {
const uint64_t StartOffset = Offset;
uint64_t CurrentOffset = Offset;
- for (; CurrentOffset < BF.getMaxSize() &&
- (!isAArch64() || CurrentOffset < StartOffset + 4);
- ++CurrentOffset)
+ for (; CurrentOffset < BF.getMaxSize(); ++CurrentOffset)
if ((*FunctionData)[CurrentOffset] != 0)
break;
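With the 4-byte cap removed, the scan now consumes the whole zero run, and the multiple-of-4 restriction on AArch64 applies to the result. A standalone sketch of that policy (an interpretation of the new comment, not the BOLT code):

#include <cstdint>
#include <vector>

// Consume a run of zero bytes; on AArch64 report only a whole multiple of 4
// so that any remaining zero words can still decode as a veneer.
static uint64_t zerosToSkip(const std::vector<uint8_t> &Data, uint64_t Offset,
                            bool IsAArch64) {
  uint64_t Cur = Offset;
  while (Cur < Data.size() && Data[Cur] == 0)
    ++Cur;
  uint64_t Skipped = Cur - Offset;
  if (IsAArch64)
    Skipped -= Skipped % 4; // keep the consumed run 4-byte aligned
  return Skipped;
}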
diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake
index 3d4d71a..be3d0cf 100644
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -200,16 +200,17 @@ endforeach()
if(FUCHSIA_SDK)
set(FUCHSIA_aarch64-unknown-fuchsia_NAME arm64)
+ set(FUCHSIA_arm-unknown-fuchsia_NAME arm)
set(FUCHSIA_i386-unknown-fuchsia_NAME x64)
set(FUCHSIA_x86_64-unknown-fuchsia_NAME x64)
set(FUCHSIA_riscv64-unknown-fuchsia_NAME riscv64)
- foreach(target i386-unknown-fuchsia;x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;riscv64-unknown-fuchsia)
+ foreach(target i386-unknown-fuchsia;x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;arm-unknown-fuchsia;riscv64-unknown-fuchsia)
set(FUCHSIA_${target}_COMPILER_FLAGS "--target=${target} -I${FUCHSIA_SDK}/pkg/sync/include -I${FUCHSIA_SDK}/pkg/fdio/include")
set(FUCHSIA_${target}_LINKER_FLAGS "-L${FUCHSIA_SDK}/arch/${FUCHSIA_${target}_NAME}/lib")
set(FUCHSIA_${target}_SYSROOT "${FUCHSIA_SDK}/arch/${FUCHSIA_${target}_NAME}/sysroot")
endforeach()
- foreach(target i386-unknown-fuchsia;x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;riscv64-unknown-fuchsia)
+ foreach(target i386-unknown-fuchsia;x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;arm-unknown-fuchsia;riscv64-unknown-fuchsia)
# Set the per-target builtins options.
list(APPEND BUILTIN_TARGETS "${target}")
set(BUILTINS_${target}_CMAKE_SYSTEM_NAME Fuchsia CACHE STRING "")
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ae21c69b..e8339fa 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -447,6 +447,7 @@ Bug Fixes in This Version
- Fixed a failed assertion with empty filename in ``#embed`` directive. (#GH162951)
- Fixed a crash triggered by unterminated ``__has_embed``. (#GH162953)
- Accept empty enumerations in MSVC-compatible C mode. (#GH114402)
+- Fixed false-positive shadow diagnostics for lambdas in explicit object member functions. (#GH163731)
Bug Fixes to Compiler Builtins
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -513,6 +514,7 @@ Bug Fixes to C++ Support
- Fixed a template depth issue when parsing lambdas inside a type constraint. (#GH162092)
- Diagnose unresolved overload sets in non-dependent compound requirements. (#GH51246) (#GH97753)
- Fix a crash when extracting unavailable member type from alias in template deduction. (#GH165560)
+- Fix incorrect diagnostics for lambdas with init-captures inside braced initializers. (#GH163498)
Bug Fixes to AST Handling
^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/JSONNodeDumper.h b/clang/include/clang/AST/JSONNodeDumper.h
index 427a9c5..d364795 100644
--- a/clang/include/clang/AST/JSONNodeDumper.h
+++ b/clang/include/clang/AST/JSONNodeDumper.h
@@ -149,7 +149,7 @@ class JSONNodeDumper
void writeIncludeStack(PresumedLoc Loc, bool JustFirst = false);
// Writes the attributes of a SourceLocation object without.
- void writeBareSourceLocation(SourceLocation Loc, bool IsSpelling);
+ void writeBareSourceLocation(SourceLocation Loc);
// Writes the attributes of a SourceLocation to JSON based on its presumed
// spelling location. If the given location represents a macro invocation,
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 83980e3..afd44a1 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -312,6 +312,8 @@ def warn_drv_yc_multiple_inputs_clang_cl : Warning<
def warn_drv_potentially_misspelled_joined_argument : Warning<
"joined argument treated as '%0'; did you mean '%1'?">, InGroup<UnknownArgument>;
+def err_drv_too_many_actions : Error<
+ "only one action option is allowed; got %0">;
def err_drv_invalid_value : Error<"invalid value '%1' in '%0'">;
def err_drv_invalid_int_value : Error<"invalid integral value '%1' in '%0'">;
def err_drv_invalid_value_with_suggestion : Error<
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index eb8d7d1..0470645 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11309,9 +11309,6 @@ public:
InventedParameterInfos.end());
}
- /// The number of SFINAE diagnostics that have been trapped.
- unsigned NumSFINAEErrors;
-
ArrayRef<sema::FunctionScopeInfo *> getFunctionScopes() const {
return llvm::ArrayRef(FunctionScopes.begin() + FunctionScopesStart,
FunctionScopes.end());
@@ -12385,49 +12382,65 @@ public:
///@{
public:
- /// When true, access checking violations are treated as SFINAE
- /// failures rather than hard errors.
- bool AccessCheckingSFINAE;
+ class SFINAETrap;
+
+ struct SFINAEContextBase {
+ SFINAEContextBase(Sema &S, SFINAETrap *Cur)
+ : S(S), Prev(std::exchange(S.CurrentSFINAEContext, Cur)) {}
+
+ protected:
+ Sema &S;
+ ~SFINAEContextBase() { S.CurrentSFINAEContext = Prev; }
+
+ private:
+ SFINAETrap *Prev;
+ };
+
+ struct NonSFINAEContext : SFINAEContextBase {
+ NonSFINAEContext(Sema &S) : SFINAEContextBase(S, nullptr) {}
+ };
/// RAII class used to determine whether SFINAE has
/// trapped any errors that occur during template argument
/// deduction.
- class SFINAETrap {
- Sema &SemaRef;
- unsigned PrevSFINAEErrors;
- bool PrevInNonInstantiationSFINAEContext;
- bool PrevAccessCheckingSFINAE;
- bool PrevLastDiagnosticIgnored;
+ class SFINAETrap : SFINAEContextBase {
+ bool HasErrorOccurred = false;
+ bool WithAccessChecking = false;
+ bool PrevLastDiagnosticIgnored =
+ S.getDiagnostics().isLastDiagnosticIgnored();
+ sema::TemplateDeductionInfo *DeductionInfo = nullptr;
+
+ SFINAETrap(Sema &S, sema::TemplateDeductionInfo *Info,
+ bool WithAccessChecking)
+ : SFINAEContextBase(S, this), WithAccessChecking(WithAccessChecking),
+ DeductionInfo(Info) {}
public:
- /// \param ForValidityCheck If true, discard all diagnostics (from the
+ /// \param WithAccessChecking If true, discard all diagnostics (from the
/// immediate context) instead of adding them to the currently active
- /// \ref TemplateDeductionInfo (as returned by \ref isSFINAEContext).
- explicit SFINAETrap(Sema &SemaRef, bool ForValidityCheck = false)
- : SemaRef(SemaRef), PrevSFINAEErrors(SemaRef.NumSFINAEErrors),
- PrevInNonInstantiationSFINAEContext(
- SemaRef.InNonInstantiationSFINAEContext),
- PrevAccessCheckingSFINAE(SemaRef.AccessCheckingSFINAE),
- PrevLastDiagnosticIgnored(
- SemaRef.getDiagnostics().isLastDiagnosticIgnored()) {
- if (ForValidityCheck || !SemaRef.isSFINAEContext())
- SemaRef.InNonInstantiationSFINAEContext = true;
- SemaRef.AccessCheckingSFINAE = ForValidityCheck;
- }
+ /// \ref TemplateDeductionInfo.
+ explicit SFINAETrap(Sema &S, bool WithAccessChecking = false)
+ : SFINAETrap(S, /*Info=*/nullptr, WithAccessChecking) {}
+
+ SFINAETrap(Sema &S, sema::TemplateDeductionInfo &Info)
+ : SFINAETrap(S, &Info, /*WithAccessChecking=*/false) {}
~SFINAETrap() {
- SemaRef.NumSFINAEErrors = PrevSFINAEErrors;
- SemaRef.InNonInstantiationSFINAEContext =
- PrevInNonInstantiationSFINAEContext;
- SemaRef.AccessCheckingSFINAE = PrevAccessCheckingSFINAE;
- SemaRef.getDiagnostics().setLastDiagnosticIgnored(
- PrevLastDiagnosticIgnored);
+ S.getDiagnostics().setLastDiagnosticIgnored(PrevLastDiagnosticIgnored);
}
- /// Determine whether any SFINAE errors have been trapped.
- bool hasErrorOccurred() const {
- return SemaRef.NumSFINAEErrors > PrevSFINAEErrors;
+ SFINAETrap(const SFINAETrap &) = delete;
+ SFINAETrap &operator=(const SFINAETrap &) = delete;
+
+ sema::TemplateDeductionInfo *getDeductionInfo() const {
+ return DeductionInfo;
}
+
+ /// Determine whether any SFINAE errors have been trapped.
+ bool hasErrorOccurred() const { return HasErrorOccurred; }
+ void setErrorOccurred() { HasErrorOccurred = true; }
+
+ bool withAccessChecking() const { return WithAccessChecking; }
};
/// RAII class used to indicate that we are performing provisional
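A hypothetical usage sketch of the reworked trap (not a call site from this patch): the trap registers itself as the current SFINAE context for its lifetime and records errors on itself, replacing the old Sema-wide NumSFINAEErrors counter. It assumes SFINAE-eligible diagnostics reach the active trap via getSFINAEContext() and call setErrorOccurred():

#include "clang/Sema/Sema.h"
#include "clang/Sema/TemplateDeduction.h"
using namespace clang;

static bool tryCandidate(Sema &S, sema::TemplateDeductionInfo &Info) {
  Sema::SFINAETrap Trap(S, Info); // becomes the current SFINAE context
  // ... perform substitution/deduction here; trapped diagnostics mark the
  // trap via S.getSFINAEContext()->setErrorOccurred() ...
  return !Trap.hasErrorOccurred(); // a trapped error means deduction failure,
                                   // not a hard error
}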
@@ -13148,9 +13161,6 @@ public:
PartialOrderingTTP,
} Kind;
- /// Was the enclosing context a non-instantiation SFINAE context?
- bool SavedInNonInstantiationSFINAEContext;
-
/// Whether we're substituting into constraints.
bool InConstraintSubstitution;
@@ -13195,22 +13205,15 @@ public:
return {TemplateArgs, NumTemplateArgs};
}
- /// The template deduction info object associated with the
- /// substitution or checking of explicit or deduced template arguments.
- sema::TemplateDeductionInfo *DeductionInfo;
-
/// The source range that covers the construct that cause
/// the instantiation, e.g., the template-id that causes a class
/// template instantiation.
SourceRange InstantiationRange;
CodeSynthesisContext()
- : Kind(TemplateInstantiation),
- SavedInNonInstantiationSFINAEContext(false),
- InConstraintSubstitution(false),
+ : Kind(TemplateInstantiation), InConstraintSubstitution(false),
InParameterMappingSubstitution(false), Entity(nullptr),
- Template(nullptr), TemplateArgs(nullptr), NumTemplateArgs(0),
- DeductionInfo(nullptr) {}
+ Template(nullptr), TemplateArgs(nullptr), NumTemplateArgs(0) {}
/// Determines whether this template is an actual instantiation
/// that should be counted toward the maximum instantiation depth.
@@ -13262,7 +13265,6 @@ public:
FunctionTemplateDecl *FunctionTemplate,
ArrayRef<TemplateArgument> TemplateArgs,
CodeSynthesisContext::SynthesisKind Kind,
- sema::TemplateDeductionInfo &DeductionInfo,
SourceRange InstantiationRange = SourceRange());
/// Note that we are instantiating as part of template
@@ -13270,7 +13272,6 @@ public:
InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
TemplateDecl *Template,
ArrayRef<TemplateArgument> TemplateArgs,
- sema::TemplateDeductionInfo &DeductionInfo,
SourceRange InstantiationRange = SourceRange());
/// Note that we are instantiating as part of template
@@ -13279,7 +13280,6 @@ public:
InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
ClassTemplatePartialSpecializationDecl *PartialSpec,
ArrayRef<TemplateArgument> TemplateArgs,
- sema::TemplateDeductionInfo &DeductionInfo,
SourceRange InstantiationRange = SourceRange());
/// Note that we are instantiating as part of template
@@ -13288,7 +13288,6 @@ public:
InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
VarTemplatePartialSpecializationDecl *PartialSpec,
ArrayRef<TemplateArgument> TemplateArgs,
- sema::TemplateDeductionInfo &DeductionInfo,
SourceRange InstantiationRange = SourceRange());
/// Note that we are instantiating a default argument for a function
@@ -13334,7 +13333,6 @@ public:
/// concept.
InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
ConstraintSubstitution, NamedDecl *Template,
- sema::TemplateDeductionInfo &DeductionInfo,
SourceRange InstantiationRange);
struct ConstraintNormalization {};
@@ -13354,7 +13352,6 @@ public:
/// a requirement of a requires expression.
InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
concepts::Requirement *Req,
- sema::TemplateDeductionInfo &DeductionInfo,
SourceRange InstantiationRange = SourceRange());
/// \brief Note that we are checking the satisfaction of the constraint
@@ -13366,7 +13363,6 @@ public:
/// \brief Note that we are checking a requires clause.
InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
const RequiresExpr *E,
- sema::TemplateDeductionInfo &DeductionInfo,
SourceRange InstantiationRange);
struct BuildingDeductionGuidesTag {};
@@ -13399,8 +13395,7 @@ public:
SourceLocation PointOfInstantiation,
SourceRange InstantiationRange, Decl *Entity,
NamedDecl *Template = nullptr,
- ArrayRef<TemplateArgument> TemplateArgs = {},
- sema::TemplateDeductionInfo *DeductionInfo = nullptr);
+ ArrayRef<TemplateArgument> TemplateArgs = {});
InstantiatingTemplate(const InstantiatingTemplate &) = delete;
@@ -13541,12 +13536,7 @@ public:
/// recent visible declaration of that namespace.
llvm::DenseMap<NamedDecl *, NamedDecl *> VisibleNamespaceCache;
- /// Whether we are in a SFINAE context that is not associated with
- /// template instantiation.
- ///
- /// This is used when setting up a SFINAE trap (\c see SFINAETrap) outside
- /// of a template instantiation or template argument deduction.
- bool InNonInstantiationSFINAEContext;
+ SFINAETrap *CurrentSFINAEContext = nullptr;
/// The number of \p CodeSynthesisContexts that are not template
/// instantiations and, therefore, should not be counted as part of the
@@ -13617,15 +13607,13 @@ public:
PrintInstantiationStack(getDefaultDiagFunc());
}
- /// Determines whether we are currently in a context where
- /// template argument substitution failures are not considered
- /// errors.
- ///
- /// \returns An empty \c Optional if we're not in a SFINAE context.
- /// Otherwise, contains a pointer that, if non-NULL, contains the nearest
- /// template-deduction context object, which can be used to capture
- /// diagnostics that will be suppressed.
- std::optional<sema::TemplateDeductionInfo *> isSFINAEContext() const;
+ /// Returns a pointer to the current SFINAE context, if any.
+ [[nodiscard]] SFINAETrap *getSFINAEContext() const {
+ return CurrentSFINAEContext;
+ }
+ [[nodiscard]] bool isSFINAEContext() const {
+ return CurrentSFINAEContext != nullptr;
+ }
/// Perform substitution on the type T with a given set of template
/// arguments.
@@ -14637,7 +14625,8 @@ public:
ArrayRef<UnexpandedParameterPack> Unexpanded,
const MultiLevelTemplateArgumentList &TemplateArgs,
bool FailOnPackProducingTemplates, bool &ShouldExpand,
- bool &RetainExpansion, UnsignedOrNone &NumExpansions);
+ bool &RetainExpansion, UnsignedOrNone &NumExpansions,
+ bool Diagnose = true);
/// Determine the number of arguments in the given pack expansion
/// type.
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 193f87c..4f6f52b 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -3829,6 +3829,351 @@ static bool CheckArraySize(EvalInfo &Info, const ConstantArrayType *CAT,
/*Diag=*/true);
}
+static bool handleScalarCast(EvalInfo &Info, const FPOptions FPO, const Expr *E,
+ QualType SourceTy, QualType DestTy,
+ APValue const &Original, APValue &Result) {
+ // Booleans must be checked before integers, since isIntegerType() also
+ // returns true for bool.
+ if (SourceTy->isBooleanType()) {
+ if (DestTy->isBooleanType()) {
+ Result = Original;
+ return true;
+ }
+ if (DestTy->isIntegerType() || DestTy->isRealFloatingType()) {
+ bool BoolResult;
+ if (!HandleConversionToBool(Original, BoolResult))
+ return false;
+ uint64_t IntResult = BoolResult;
+ QualType IntType = DestTy->isIntegerType()
+ ? DestTy
+ : Info.Ctx.getIntTypeForBitwidth(64, false);
+ Result = APValue(Info.Ctx.MakeIntValue(IntResult, IntType));
+ }
+ if (DestTy->isRealFloatingType()) {
+ APValue Result2 = APValue(APFloat(0.0));
+ if (!HandleIntToFloatCast(Info, E, FPO,
+ Info.Ctx.getIntTypeForBitwidth(64, false),
+ Result.getInt(), DestTy, Result2.getFloat()))
+ return false;
+ Result = Result2;
+ }
+ return true;
+ }
+ if (SourceTy->isIntegerType()) {
+ if (DestTy->isRealFloatingType()) {
+ Result = APValue(APFloat(0.0));
+ return HandleIntToFloatCast(Info, E, FPO, SourceTy, Original.getInt(),
+ DestTy, Result.getFloat());
+ }
+ if (DestTy->isBooleanType()) {
+ bool BoolResult;
+ if (!HandleConversionToBool(Original, BoolResult))
+ return false;
+ uint64_t IntResult = BoolResult;
+ Result = APValue(Info.Ctx.MakeIntValue(IntResult, DestTy));
+ return true;
+ }
+ if (DestTy->isIntegerType()) {
+ Result = APValue(
+ HandleIntToIntCast(Info, E, DestTy, SourceTy, Original.getInt()));
+ return true;
+ }
+ } else if (SourceTy->isRealFloatingType()) {
+ if (DestTy->isRealFloatingType()) {
+ Result = Original;
+ return HandleFloatToFloatCast(Info, E, SourceTy, DestTy,
+ Result.getFloat());
+ }
+ if (DestTy->isBooleanType()) {
+ bool BoolResult;
+ if (!HandleConversionToBool(Original, BoolResult))
+ return false;
+ uint64_t IntResult = BoolResult;
+ Result = APValue(Info.Ctx.MakeIntValue(IntResult, DestTy));
+ return true;
+ }
+ if (DestTy->isIntegerType()) {
+ Result = APValue(APSInt());
+ return HandleFloatToIntCast(Info, E, SourceTy, Original.getFloat(),
+ DestTy, Result.getInt());
+ }
+ }
+
+ Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
+ return false;
+}
+
+// Do the heavy lifting for casting to aggregate types, because we have to
+// deal with bit-fields specially.
+static bool constructAggregate(EvalInfo &Info, const FPOptions FPO,
+ const Expr *E, APValue &Result,
+ QualType ResultType,
+ SmallVectorImpl<APValue> &Elements,
+ SmallVectorImpl<QualType> &ElTypes) {
+
+ SmallVector<std::tuple<APValue *, QualType, unsigned>> WorkList = {
+ {&Result, ResultType, 0}};
+
+ unsigned ElI = 0;
+ while (!WorkList.empty() && ElI < Elements.size()) {
+ auto [Res, Type, BitWidth] = WorkList.pop_back_val();
+
+ if (Type->isRealFloatingType()) {
+ if (!handleScalarCast(Info, FPO, E, ElTypes[ElI], Type, Elements[ElI],
+ *Res))
+ return false;
+ ElI++;
+ continue;
+ }
+ if (Type->isIntegerType()) {
+ if (!handleScalarCast(Info, FPO, E, ElTypes[ElI], Type, Elements[ElI],
+ *Res))
+ return false;
+ if (BitWidth > 0) {
+ if (!Res->isInt())
+ return false;
+ APSInt &Int = Res->getInt();
+ unsigned OldBitWidth = Int.getBitWidth();
+ unsigned NewBitWidth = BitWidth;
+ if (NewBitWidth < OldBitWidth)
+ Int = Int.trunc(NewBitWidth).extend(OldBitWidth);
+ }
+ ElI++;
+ continue;
+ }
+ if (Type->isVectorType()) {
+ QualType ElTy = Type->castAs<VectorType>()->getElementType();
+ unsigned NumEl = Type->castAs<VectorType>()->getNumElements();
+ SmallVector<APValue> Vals(NumEl);
+ for (unsigned I = 0; I < NumEl; ++I) {
+ if (!handleScalarCast(Info, FPO, E, ElTypes[ElI], ElTy, Elements[ElI],
+ Vals[I]))
+ return false;
+ ElI++;
+ }
+ *Res = APValue(Vals.data(), NumEl);
+ continue;
+ }
+ if (Type->isConstantArrayType()) {
+ QualType ElTy = cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))
+ ->getElementType();
+ uint64_t Size =
+ cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))->getZExtSize();
+ *Res = APValue(APValue::UninitArray(), Size, Size);
+ for (int64_t I = Size - 1; I > -1; --I)
+ WorkList.emplace_back(&Res->getArrayInitializedElt(I), ElTy, 0u);
+ continue;
+ }
+ if (Type->isRecordType()) {
+ const RecordDecl *RD = Type->getAsRecordDecl();
+
+ unsigned NumBases = 0;
+ if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD))
+ NumBases = CXXRD->getNumBases();
+
+ *Res = APValue(APValue::UninitStruct(), NumBases,
+ std::distance(RD->field_begin(), RD->field_end()));
+
+ SmallVector<std::tuple<APValue *, QualType, unsigned>> ReverseList;
+ // We need to traverse backwards.
+ // Visit the base classes.
+ if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+ if (CXXRD->getNumBases() > 0) {
+ assert(CXXRD->getNumBases() == 1);
+ const CXXBaseSpecifier &BS = CXXRD->bases_begin()[0];
+ ReverseList.emplace_back(&Res->getStructBase(0), BS.getType(), 0u);
+ }
+ }
+
+ // Visit the fields.
+ for (FieldDecl *FD : RD->fields()) {
+ unsigned FDBW = 0;
+ if (FD->isUnnamedBitField())
+ continue;
+ if (FD->isBitField()) {
+ FDBW = FD->getBitWidthValue();
+ }
+
+ ReverseList.emplace_back(&Res->getStructField(FD->getFieldIndex()),
+ FD->getType(), FDBW);
+ }
+
+ std::reverse(ReverseList.begin(), ReverseList.end());
+ llvm::append_range(WorkList, ReverseList);
+ continue;
+ }
+ Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
+ return false;
+ }
+ return true;
+}
+
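The bit-field branch above narrows the value to its declared width and then widens it again. A small standalone model of that step (sign extension chosen to match what APSInt::extend does for signed values):

#include <cassert>
#include <cstdint>

// Truncate V to Width bits, then extend back to the 32-bit storage width.
static int32_t truncExtend(int32_t V, unsigned Width) {
  unsigned Shift = 32 - Width;
  uint32_t Truncated = static_cast<uint32_t>(V) << Shift;
  return static_cast<int32_t>(Truncated) >> Shift; // arithmetic shift sign-extends
}

int main() {
  assert(truncExtend(5, 3) == -3); // 0b101 reinterpreted as a signed 3-bit value
  assert(truncExtend(2, 3) == 2);
}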
+static bool handleElementwiseCast(EvalInfo &Info, const Expr *E,
+ const FPOptions FPO,
+ SmallVectorImpl<APValue> &Elements,
+ SmallVectorImpl<QualType> &SrcTypes,
+ SmallVectorImpl<QualType> &DestTypes,
+ SmallVectorImpl<APValue> &Results) {
+
+ assert((Elements.size() == SrcTypes.size()) &&
+ (Elements.size() == DestTypes.size()));
+
+ for (unsigned I = 0, ESz = Elements.size(); I < ESz; ++I) {
+ APValue Original = Elements[I];
+ QualType SourceTy = SrcTypes[I];
+ QualType DestTy = DestTypes[I];
+
+ if (!handleScalarCast(Info, FPO, E, SourceTy, DestTy, Original, Results[I]))
+ return false;
+ }
+ return true;
+}
+
+static unsigned elementwiseSize(EvalInfo &Info, QualType BaseTy) {
+
+ SmallVector<QualType> WorkList = {BaseTy};
+
+ unsigned Size = 0;
+ while (!WorkList.empty()) {
+ QualType Type = WorkList.pop_back_val();
+ if (Type->isRealFloatingType() || Type->isIntegerType() ||
+ Type->isBooleanType()) {
+ ++Size;
+ continue;
+ }
+ if (Type->isVectorType()) {
+ unsigned NumEl = Type->castAs<VectorType>()->getNumElements();
+ Size += NumEl;
+ continue;
+ }
+ if (Type->isConstantArrayType()) {
+ QualType ElTy = cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))
+ ->getElementType();
+ uint64_t ArrSize =
+ cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))->getZExtSize();
+ for (uint64_t I = 0; I < ArrSize; ++I) {
+ WorkList.push_back(ElTy);
+ }
+ continue;
+ }
+ if (Type->isRecordType()) {
+ const RecordDecl *RD = Type->getAsRecordDecl();
+
+ // Visit the base classes.
+ if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+ if (CXXRD->getNumBases() > 0) {
+ assert(CXXRD->getNumBases() == 1);
+ const CXXBaseSpecifier &BS = CXXRD->bases_begin()[0];
+ WorkList.push_back(BS.getType());
+ }
+ }
+
+ // Visit the fields.
+ for (FieldDecl *FD : RD->fields()) {
+ if (FD->isUnnamedBitField())
+ continue;
+ WorkList.push_back(FD->getType());
+ }
+ continue;
+ }
+ }
+ return Size;
+}
+
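A toy model of the counting rule in elementwiseSize (an illustration, not the clang code): scalars contribute one slot, vectors and arrays one slot per element, and records the sum over their base and named fields.

#include <cassert>
#include <vector>

// A type is either a scalar (no elements) or a composite of element types;
// the flattened size is the number of scalar leaves.
struct Ty {
  std::vector<Ty> Elems; // empty means scalar
};
static unsigned flatSize(const Ty &T) {
  if (T.Elems.empty())
    return 1;
  unsigned N = 0;
  for (const Ty &E : T.Elems)
    N += flatSize(E);
  return N;
}

int main() {
  Ty Scalar{};
  Ty Int2{{Scalar, Scalar}};  // models vector<int, 2>
  Ty Inner{{Scalar, Int2}};   // models struct { float A; int2 B; }
  Ty Outer{{Inner, Scalar}};  // models struct : Inner { bool C; }
  assert(flatSize(Outer) == 4);
}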
+static bool hlslAggSplatHelper(EvalInfo &Info, const Expr *E, APValue &SrcVal,
+ QualType &SrcTy) {
+ SrcTy = E->getType();
+
+ if (!Evaluate(SrcVal, Info, E))
+ return false;
+
+ assert((SrcVal.isFloat() || SrcVal.isInt() ||
+ (SrcVal.isVector() && SrcVal.getVectorLength() == 1)) &&
+ "Not a valid HLSLAggregateSplatCast.");
+
+ if (SrcVal.isVector()) {
+ assert(SrcTy->isVectorType() && "Type mismatch.");
+ SrcTy = SrcTy->castAs<VectorType>()->getElementType();
+ SrcVal = SrcVal.getVectorElt(0);
+ }
+ return true;
+}
+
+static bool flattenAPValue(EvalInfo &Info, const Expr *E, APValue Value,
+ QualType BaseTy, SmallVectorImpl<APValue> &Elements,
+ SmallVectorImpl<QualType> &Types, unsigned Size) {
+
+ SmallVector<std::pair<APValue, QualType>> WorkList = {{Value, BaseTy}};
+ unsigned Populated = 0;
+ while (!WorkList.empty() && Populated < Size) {
+ auto [Work, Type] = WorkList.pop_back_val();
+
+ if (Work.isFloat() || Work.isInt()) {
+ Elements.push_back(Work);
+ Types.push_back(Type);
+ Populated++;
+ continue;
+ }
+ if (Work.isVector()) {
+ assert(Type->isVectorType() && "Type mismatch.");
+ QualType ElTy = Type->castAs<VectorType>()->getElementType();
+ for (unsigned I = 0; I < Work.getVectorLength() && Populated < Size;
+ I++) {
+ Elements.push_back(Work.getVectorElt(I));
+ Types.push_back(ElTy);
+ Populated++;
+ }
+ continue;
+ }
+ if (Work.isArray()) {
+ assert(Type->isConstantArrayType() && "Type mismatch.");
+ QualType ElTy = cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))
+ ->getElementType();
+ for (int64_t I = Work.getArraySize() - 1; I > -1; --I) {
+ WorkList.emplace_back(Work.getArrayInitializedElt(I), ElTy);
+ }
+ continue;
+ }
+
+ if (Work.isStruct()) {
+ assert(Type->isRecordType() && "Type mismatch.");
+
+ const RecordDecl *RD = Type->getAsRecordDecl();
+
+ SmallVector<std::pair<APValue, QualType>> ReverseList;
+ // Visit the fields.
+ for (FieldDecl *FD : RD->fields()) {
+ if (FD->isUnnamedBitField())
+ continue;
+ ReverseList.emplace_back(Work.getStructField(FD->getFieldIndex()),
+ FD->getType());
+ }
+
+ std::reverse(ReverseList.begin(), ReverseList.end());
+ llvm::append_range(WorkList, ReverseList);
+
+ // Visit the base classes.
+ if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+ if (CXXRD->getNumBases() > 0) {
+ assert(CXXRD->getNumBases() == 1);
+ const CXXBaseSpecifier &BS = CXXRD->bases_begin()[0];
+ const APValue &Base = Work.getStructBase(0);
+
+ // Can happen in error cases.
+ if (!Base.isStruct())
+ return false;
+
+ WorkList.emplace_back(Base, BS.getType());
+ }
+ }
+ continue;
+ }
+ Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
+ return false;
+ }
+ return true;
+}
+
namespace {
/// A handle to a complete object (an object that is not a subobject of
/// another object).
@@ -4639,6 +4984,30 @@ handleLValueToRValueConversion(EvalInfo &Info, const Expr *Conv, QualType Type,
return Obj && extractSubobject(Info, Conv, Obj, LVal.Designator, RVal, AK);
}
+static bool hlslElementwiseCastHelper(EvalInfo &Info, const Expr *E,
+ QualType DestTy,
+ SmallVectorImpl<APValue> &SrcVals,
+ SmallVectorImpl<QualType> &SrcTypes) {
+ APValue Val;
+ if (!Evaluate(Val, Info, E))
+ return false;
+
+ // must be dealing with a record
+ if (Val.isLValue()) {
+ LValue LVal;
+ LVal.setFrom(Info.Ctx, Val);
+ if (!handleLValueToRValueConversion(Info, E, E->getType(), LVal, Val))
+ return false;
+ }
+
+ unsigned NEls = elementwiseSize(Info, DestTy);
+ // flatten the source
+ if (!flattenAPValue(Info, E, Val, E->getType(), SrcVals, SrcTypes, NEls))
+ return false;
+
+ return true;
+}
+
/// Perform an assignment of Val to LVal. Takes ownership of Val.
static bool handleAssignment(EvalInfo &Info, const Expr *E, const LValue &LVal,
QualType LValType, APValue &Val) {
@@ -8670,6 +9039,25 @@ public:
case CK_UserDefinedConversion:
return StmtVisitorTy::Visit(E->getSubExpr());
+ case CK_HLSLArrayRValue: {
+ const Expr *SubExpr = E->getSubExpr();
+ if (!SubExpr->isGLValue()) {
+ APValue Val;
+ if (!Evaluate(Val, Info, SubExpr))
+ return false;
+ return DerivedSuccess(Val, E);
+ }
+
+ LValue LVal;
+ if (!EvaluateLValue(SubExpr, LVal, Info))
+ return false;
+ APValue RVal;
+ // Note, we use the subexpression's type in order to retain cv-qualifiers.
+ if (!handleLValueToRValueConversion(Info, E, SubExpr->getType(), LVal,
+ RVal))
+ return false;
+ return DerivedSuccess(RVal, E);
+ }
case CK_LValueToRValue: {
LValue LVal;
if (!EvaluateLValue(E->getSubExpr(), LVal, Info))
@@ -10854,6 +11242,42 @@ bool RecordExprEvaluator::VisitCastExpr(const CastExpr *E) {
Result = *Value;
return true;
}
+ case CK_HLSLAggregateSplatCast: {
+ APValue Val;
+ QualType ValTy;
+
+ if (!hlslAggSplatHelper(Info, E->getSubExpr(), Val, ValTy))
+ return false;
+
+ unsigned NEls = elementwiseSize(Info, E->getType());
+ // splat our Val
+ SmallVector<APValue> SplatEls(NEls, Val);
+ SmallVector<QualType> SplatType(NEls, ValTy);
+
+ // cast the elements and construct our struct result
+ const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
+ if (!constructAggregate(Info, FPO, E, Result, E->getType(), SplatEls,
+ SplatType))
+ return false;
+
+ return true;
+ }
+ case CK_HLSLElementwiseCast: {
+ SmallVector<APValue> SrcEls;
+ SmallVector<QualType> SrcTypes;
+
+ if (!hlslElementwiseCastHelper(Info, E->getSubExpr(), E->getType(), SrcEls,
+ SrcTypes))
+ return false;
+
+ // cast the elements and construct our struct result
+ const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
+ if (!constructAggregate(Info, FPO, E, Result, E->getType(), SrcEls,
+ SrcTypes))
+ return false;
+
+ return true;
+ }
}
}
@@ -11349,6 +11773,38 @@ bool VectorExprEvaluator::VisitCastExpr(const CastExpr *E) {
Elements.push_back(Val.getVectorElt(I));
return Success(Elements, E);
}
+ case CK_HLSLAggregateSplatCast: {
+ APValue Val;
+ QualType ValTy;
+
+ if (!hlslAggSplatHelper(Info, SE, Val, ValTy))
+ return false;
+
+ // cast our Val once.
+ APValue Result;
+ const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
+ if (!handleScalarCast(Info, FPO, E, ValTy, VTy->getElementType(), Val,
+ Result))
+ return false;
+
+ SmallVector<APValue, 4> SplatEls(NElts, Result);
+ return Success(SplatEls, E);
+ }
+ case CK_HLSLElementwiseCast: {
+ SmallVector<APValue> SrcVals;
+ SmallVector<QualType> SrcTypes;
+
+ if (!hlslElementwiseCastHelper(Info, SE, E->getType(), SrcVals, SrcTypes))
+ return false;
+
+ const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
+ SmallVector<QualType, 4> DestTypes(NElts, VTy->getElementType());
+ SmallVector<APValue, 4> ResultEls(NElts);
+ if (!handleElementwiseCast(Info, E, FPO, SrcVals, SrcTypes, DestTypes,
+ ResultEls))
+ return false;
+ return Success(ResultEls, E);
+ }
default:
return ExprEvaluatorBaseTy::VisitCastExpr(E);
}
@@ -13316,6 +13772,7 @@ namespace {
bool VisitCallExpr(const CallExpr *E) {
return handleCallExpr(E, Result, &This);
}
+ bool VisitCastExpr(const CastExpr *E);
bool VisitInitListExpr(const InitListExpr *E,
QualType AllocType = QualType());
bool VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E);
@@ -13386,6 +13843,49 @@ static bool MaybeElementDependentArrayFiller(const Expr *FillerExpr) {
return true;
}
+bool ArrayExprEvaluator::VisitCastExpr(const CastExpr *E) {
+ const Expr *SE = E->getSubExpr();
+
+ switch (E->getCastKind()) {
+ default:
+ return ExprEvaluatorBaseTy::VisitCastExpr(E);
+ case CK_HLSLAggregateSplatCast: {
+ APValue Val;
+ QualType ValTy;
+
+ if (!hlslAggSplatHelper(Info, SE, Val, ValTy))
+ return false;
+
+ unsigned NEls = elementwiseSize(Info, E->getType());
+
+ SmallVector<APValue> SplatEls(NEls, Val);
+ SmallVector<QualType> SplatType(NEls, ValTy);
+
+ // cast the elements
+ const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
+ if (!constructAggregate(Info, FPO, E, Result, E->getType(), SplatEls,
+ SplatType))
+ return false;
+
+ return true;
+ }
+ case CK_HLSLElementwiseCast: {
+ SmallVector<APValue> SrcEls;
+ SmallVector<QualType> SrcTypes;
+
+ if (!hlslElementwiseCastHelper(Info, SE, E->getType(), SrcEls, SrcTypes))
+ return false;
+
+ // cast the elements
+ const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
+ if (!constructAggregate(Info, FPO, E, Result, E->getType(), SrcEls,
+ SrcTypes))
+ return false;
+ return true;
+ }
+ }
+}
+
bool ArrayExprEvaluator::VisitInitListExpr(const InitListExpr *E,
QualType AllocType) {
const ConstantArrayType *CAT = Info.Ctx.getAsConstantArrayType(
@@ -17192,7 +17692,6 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) {
case CK_NoOp:
case CK_LValueToRValueBitCast:
case CK_HLSLArrayRValue:
- case CK_HLSLElementwiseCast:
return ExprEvaluatorBaseTy::VisitCastExpr(E);
case CK_MemberPointerToBoolean:
@@ -17339,6 +17838,21 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) {
return Error(E);
return Success(Val.getVectorElt(0), E);
}
+ case CK_HLSLElementwiseCast: {
+ SmallVector<APValue> SrcVals;
+ SmallVector<QualType> SrcTypes;
+
+ if (!hlslElementwiseCastHelper(Info, SubExpr, DestType, SrcVals, SrcTypes))
+ return false;
+
+ // cast our single element
+ const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
+ APValue ResultVal;
+ if (!handleScalarCast(Info, FPO, E, SrcTypes[0], DestType, SrcVals[0],
+ ResultVal))
+ return false;
+ return Success(ResultVal, E);
+ }
}
llvm_unreachable("unknown cast resulting in integral value");
@@ -17876,6 +18390,9 @@ bool FloatExprEvaluator::VisitCastExpr(const CastExpr *E) {
default:
return ExprEvaluatorBaseTy::VisitCastExpr(E);
+ case CK_HLSLAggregateSplatCast:
+ llvm_unreachable("invalid cast kind for floating value");
+
case CK_IntegralToFloating: {
APSInt IntResult;
const FPOptions FPO = E->getFPFeaturesInEffect(
@@ -17914,6 +18431,23 @@ bool FloatExprEvaluator::VisitCastExpr(const CastExpr *E) {
return Error(E);
return Success(Val.getVectorElt(0), E);
}
+ case CK_HLSLElementwiseCast: {
+ SmallVector<APValue> SrcVals;
+ SmallVector<QualType> SrcTypes;
+
+ if (!hlslElementwiseCastHelper(Info, SubExpr, E->getType(), SrcVals,
+ SrcTypes))
+ return false;
+
+ // cast our single element
+ const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
+ APValue ResultVal;
+ if (!handleScalarCast(Info, FPO, E, SrcTypes[0], E->getType(), SrcVals[0],
+ ResultVal))
+ return false;
+ return Success(ResultVal, E);
+ }
}
}
diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp
index 9f4dba9..89abf88 100644
--- a/clang/lib/AST/JSONNodeDumper.cpp
+++ b/clang/lib/AST/JSONNodeDumper.cpp
@@ -272,15 +272,13 @@ void JSONNodeDumper::writeIncludeStack(PresumedLoc Loc, bool JustFirst) {
JOS.attributeEnd();
}
-void JSONNodeDumper::writeBareSourceLocation(SourceLocation Loc,
- bool IsSpelling) {
+void JSONNodeDumper::writeBareSourceLocation(SourceLocation Loc) {
PresumedLoc Presumed = SM.getPresumedLoc(Loc);
- unsigned ActualLine = IsSpelling ? SM.getSpellingLineNumber(Loc)
- : SM.getExpansionLineNumber(Loc);
- StringRef ActualFile = SM.getBufferName(Loc);
-
if (Presumed.isValid()) {
- JOS.attribute("offset", SM.getDecomposedLoc(Loc).second);
+ StringRef ActualFile = SM.getBufferName(Loc);
+ auto [FID, FilePos] = SM.getDecomposedLoc(Loc);
+ unsigned ActualLine = SM.getLineNumber(FID, FilePos);
+ JOS.attribute("offset", FilePos);
if (LastLocFilename != ActualFile) {
JOS.attribute("file", ActualFile);
JOS.attribute("line", ActualLine);
@@ -318,18 +316,17 @@ void JSONNodeDumper::writeSourceLocation(SourceLocation Loc) {
if (Expansion != Spelling) {
// If the expansion and the spelling are different, output subobjects
// describing both locations.
- JOS.attributeObject("spellingLoc", [Spelling, this] {
- writeBareSourceLocation(Spelling, /*IsSpelling*/ true);
- });
+ JOS.attributeObject(
+ "spellingLoc", [Spelling, this] { writeBareSourceLocation(Spelling); });
JOS.attributeObject("expansionLoc", [Expansion, Loc, this] {
- writeBareSourceLocation(Expansion, /*IsSpelling*/ false);
+ writeBareSourceLocation(Expansion);
// If there is a macro expansion, add extra information if the interesting
// bit is the macro arg expansion.
if (SM.isMacroArgExpansion(Loc))
JOS.attribute("isMacroArgExpansion", true);
});
} else
- writeBareSourceLocation(Spelling, /*IsSpelling*/ true);
+ writeBareSourceLocation(Spelling);
}
void JSONNodeDumper::writeSourceRange(SourceRange R) {
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 549d720..41aebdb 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -2461,7 +2461,6 @@ void TextNodeDumper::VisitVarDecl(const VarDecl *D) {
break;
case VarDecl::ParenListInit:
OS << " parenlistinit";
- break;
}
}
if (D->needsDestruction(D->getASTContext()))
@@ -2469,19 +2468,6 @@ void TextNodeDumper::VisitVarDecl(const VarDecl *D) {
if (D->isParameterPack())
OS << " pack";
- VarDecl::DefinitionKind K = D->isThisDeclarationADefinition();
- switch (K) {
- case VarDecl::DefinitionKind::DeclarationOnly:
- OS << " declaration";
- break;
- case VarDecl::DefinitionKind::Definition:
- OS << " definition";
- break;
- case VarDecl::DefinitionKind::TentativeDefinition:
- OS << " tentative definition";
- break;
- }
-
if (const auto *Instance = D->getTemplateInstantiationPattern()) {
OS << " instantiated_from";
dumpPointer(Instance);
diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp
index 54c30c0..2f40c7e 100644
--- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp
+++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp
@@ -238,10 +238,12 @@ const auto isMoveOnly = [] {
};
template <class T> struct NodeID;
-template <> struct NodeID<Expr> { static constexpr StringRef value = "expr"; };
-template <> struct NodeID<Decl> { static constexpr StringRef value = "decl"; };
-constexpr StringRef NodeID<Expr>::value;
-constexpr StringRef NodeID<Decl>::value;
+template <> struct NodeID<Expr> {
+ static constexpr StringRef value = "expr";
+};
+template <> struct NodeID<Decl> {
+ static constexpr StringRef value = "decl";
+};
template <class T,
class F = const Stmt *(ExprMutationAnalyzer::Analyzer::*)(const T *)>
diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 8f09564..06d7380 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -58,9 +58,10 @@ enum PGOHashVersion : unsigned {
PGO_HASH_V1,
PGO_HASH_V2,
PGO_HASH_V3,
+ PGO_HASH_V4,
// Keep this set to the latest hash version.
- PGO_HASH_LATEST = PGO_HASH_V3
+ PGO_HASH_LATEST = PGO_HASH_V4
};
namespace {
@@ -152,7 +153,9 @@ static PGOHashVersion getPGOHashVersion(llvm::IndexedInstrProfReader *PGOReader,
return PGO_HASH_V1;
if (PGOReader->getVersion() <= 5)
return PGO_HASH_V2;
- return PGO_HASH_V3;
+ if (PGOReader->getVersion() <= 12)
+ return PGO_HASH_V3;
+ return PGO_HASH_V4;
}
/// A RecursiveASTVisitor that fills a map of statements to PGO counters.
@@ -1099,6 +1102,8 @@ void CodeGenPGO::mapRegionCounters(const Decl *D) {
assert(Walker.NextCounter > 0 && "no entry counter mapped for decl");
NumRegionCounters = Walker.NextCounter;
FunctionHash = Walker.Hash.finalize();
+ if (HashVersion >= PGO_HASH_V4)
+ FunctionHash &= llvm::NamedInstrProfRecord::FUNC_HASH_MASK;
}
bool CodeGenPGO::skipRegionMappingForDecl(const Decl *D) {
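A minimal model of the V4 finalization step above. The mask value below is a placeholder, since only the name llvm::NamedInstrProfRecord::FUNC_HASH_MASK appears in this hunk; the assumption is that masking keeps the top hash bits free for other profile uses.

#include <cstdint>

// Placeholder mask width; the real constant is
// llvm::NamedInstrProfRecord::FUNC_HASH_MASK.
constexpr uint64_t kFuncHashMask = 0x0FFFFFFFFFFFFFFFULL;

inline uint64_t finalizeHashV4(uint64_t RawHash) {
  return RawHash & kFuncHashMask;
}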
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 654a382..1a243fe 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -22,6 +22,7 @@
#include "llvm/Support/Process.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/TargetParser/Host.h"
+#include "llvm/TargetParser/TargetParser.h"
#include <optional>
#include <system_error>
@@ -1095,9 +1096,21 @@ bool AMDGPUToolChain::shouldSkipSanitizeOption(
if (K != SanitizerKind::Address)
return true;
+ // Check 'xnack+' availability by default
+ llvm::StringRef Processor =
+ getProcessorFromTargetID(TC.getTriple(), TargetID);
+ auto ProcKind = TC.getTriple().isAMDGCN()
+ ? llvm::AMDGPU::parseArchAMDGCN(Processor)
+ : llvm::AMDGPU::parseArchR600(Processor);
+ auto Features = TC.getTriple().isAMDGCN()
+ ? llvm::AMDGPU::getArchAttrAMDGCN(ProcKind)
+ : llvm::AMDGPU::getArchAttrR600(ProcKind);
+ if (Features & llvm::AMDGPU::FEATURE_XNACK_ALWAYS)
+ return false;
+
+ // Look for the xnack feature in TargetID
llvm::StringMap<bool> FeatureMap;
auto OptionalGpuArch = parseTargetID(TC.getTriple(), TargetID, &FeatureMap);
-
assert(OptionalGpuArch && "Invalid Target ID");
(void)OptionalGpuArch;
auto Loc = FeatureMap.find("xnack");
diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp
index 8d3fba7..5d7221b 100644
--- a/clang/lib/Driver/ToolChains/HLSL.cpp
+++ b/clang/lib/Driver/ToolChains/HLSL.cpp
@@ -567,3 +567,7 @@ bool HLSLToolChain::isLastJob(DerivedArgList &Args,
// output to the result file.
return true;
}
+
+void HLSLToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {
+ CC1Args.push_back("-Wconversion");
+}
diff --git a/clang/lib/Driver/ToolChains/HLSL.h b/clang/lib/Driver/ToolChains/HLSL.h
index 3aed904..5bf385e 100644
--- a/clang/lib/Driver/ToolChains/HLSL.h
+++ b/clang/lib/Driver/ToolChains/HLSL.h
@@ -91,6 +91,8 @@ public:
// Set default DWARF version to 4 for DXIL uses version 4.
unsigned GetDefaultDwarfVersion() const override { return 4; }
+ void addClangWarningOptions(llvm::opt::ArgStringList &CC1Args) const override;
+
private:
mutable std::unique_ptr<tools::hlsl::Validator> Validator;
mutable std::unique_ptr<tools::hlsl::MetalConverter> MetalConverter;
diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
index 208776e..2e2703d 100644
--- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
@@ -1074,78 +1074,6 @@ _HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32)
float4 f16tof32(uint4);
//===----------------------------------------------------------------------===//
-// firstbithigh builtins
-//===----------------------------------------------------------------------===//
-
-/// \fn T firstbithigh(T Val)
-/// \brief Returns the location of the first set bit starting from the highest
-/// order bit and working downward, per component.
-/// \param Val the input value.
-
-#ifdef __HLSL_ENABLE_16_BIT
-_HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint firstbithigh(int16_t);
-_HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint2 firstbithigh(int16_t2);
-_HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint3 firstbithigh(int16_t3);
-_HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint4 firstbithigh(int16_t4);
-_HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint firstbithigh(uint16_t);
-_HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint2 firstbithigh(uint16_t2);
-_HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint3 firstbithigh(uint16_t3);
-_HLSL_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint4 firstbithigh(uint16_t4);
-#endif
-
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint firstbithigh(int);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint2 firstbithigh(int2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint3 firstbithigh(int3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint4 firstbithigh(int4);
-
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint firstbithigh(uint);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint2 firstbithigh(uint2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint3 firstbithigh(uint3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint4 firstbithigh(uint4);
-
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint firstbithigh(int64_t);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint2 firstbithigh(int64_t2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint3 firstbithigh(int64_t3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint4 firstbithigh(int64_t4);
-
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint firstbithigh(uint64_t);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint2 firstbithigh(uint64_t2);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint3 firstbithigh(uint64_t3);
-_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
-uint4 firstbithigh(uint64_t4);
-
-//===----------------------------------------------------------------------===//
// firstbitlow builtins
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h
index c877234..3d8fe7e 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h
@@ -148,6 +148,18 @@ template <typename T> constexpr T ldexp_impl(T X, T Exp) {
return exp2(Exp) * X;
}
+template <typename K, typename T, int BitWidth>
+constexpr K firstbithigh_impl(T X) {
+ K FBH = __builtin_hlsl_elementwise_firstbithigh(X);
+#if defined(__DIRECTX__)
+ // The firstbithigh DXIL ops count bits from the wrong side, so we need to
+ // invert it for DirectX.
+ K Inversion = (BitWidth - 1) - FBH;
+ FBH = select(FBH == -1, FBH, Inversion);
+#endif
+ return FBH;
+}
+
} // namespace __detail
} // namespace hlsl
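The __DIRECTX__ branch above flips the reported bit index. A standalone model of that fix-up, assuming the builtin reports positions counted from the most significant bit and uses -1 as the no-bits-set sentinel:

#include <cassert>

// Convert a count-from-MSB index to the HLSL count-from-LSB convention,
// preserving the -1 "no bit set" sentinel.
static int fixupFirstBitHigh(int FromMsb, int BitWidth) {
  return FromMsb == -1 ? -1 : (BitWidth - 1) - FromMsb;
}

int main() {
  assert(fixupFirstBitHigh(27, 32) == 4);  // 0x10: highest set bit is bit 4
  assert(fixupFirstBitHigh(0, 32) == 31);  // top bit set
  assert(fixupFirstBitHigh(-1, 32) == -1); // zero input: no bits set
}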
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 5ba5bfb..33ed143 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -262,6 +262,67 @@ faceforward(__detail::HLSL_FIXED_VECTOR<float, L> N,
}
//===----------------------------------------------------------------------===//
+// firstbithigh builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T firstbithigh(T Val)
+/// \brief Returns the location of the first set bit starting from the highest
+/// order bit and working downward, per component.
+/// \param Val the input value.
+
+#ifdef __HLSL_ENABLE_16_BIT
+
+template <typename T>
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+const inline __detail::enable_if_t<__detail::is_same<int16_t, T>::value ||
+ __detail::is_same<uint16_t, T>::value,
+ uint> firstbithigh(T X) {
+ return __detail::firstbithigh_impl<uint, T, 16>(X);
+}
+
+template <typename T, int N>
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+const
+ inline __detail::enable_if_t<__detail::is_same<int16_t, T>::value ||
+ __detail::is_same<uint16_t, T>::value,
+ vector<uint, N>> firstbithigh(vector<T, N> X) {
+ return __detail::firstbithigh_impl<vector<uint, N>, vector<T, N>, 16>(X);
+}
+
+#endif
+
+template <typename T>
+const inline __detail::enable_if_t<
+ __detail::is_same<int, T>::value || __detail::is_same<uint, T>::value, uint>
+firstbithigh(T X) {
+ return __detail::firstbithigh_impl<uint, T, 32>(X);
+}
+
+template <typename T, int N>
+const inline __detail::enable_if_t<__detail::is_same<int, T>::value ||
+ __detail::is_same<uint, T>::value,
+ vector<uint, N>>
+firstbithigh(vector<T, N> X) {
+ return __detail::firstbithigh_impl<vector<uint, N>, vector<T, N>, 32>(X);
+}
+
+template <typename T>
+const inline __detail::enable_if_t<__detail::is_same<int64_t, T>::value ||
+ __detail::is_same<uint64_t, T>::value,
+ uint>
+firstbithigh(T X) {
+ return __detail::firstbithigh_impl<uint, T, 64>(X);
+}
+
+template <typename T, int N>
+const inline __detail::enable_if_t<__detail::is_same<int64_t, T>::value ||
+ __detail::is_same<uint64_t, T>::value,
+ vector<uint, N>>
+firstbithigh(vector<T, N> X) {
+ return __detail::firstbithigh_impl<vector<uint, N>, vector<T, N>, 64>(X);
+}
+
+//===----------------------------------------------------------------------===//
// fmod builtins
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index cde354c..7633806 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -394,36 +394,48 @@ Interpreter::outOfProcessJITBuilder(JITConfig Config) {
llvm::Expected<std::string>
Interpreter::getOrcRuntimePath(const driver::ToolChain &TC) {
- std::optional<std::string> CompilerRTPath = TC.getCompilerRTPath();
- std::optional<std::string> ResourceDir = TC.getRuntimePath();
+ const std::array<const char *, 3> OrcRTLibNames = {
+ "liborc_rt.a", "liborc_rt_osx.a", "liborc_rt-x86_64.a"};
+
+ auto findInDir = [&](llvm::StringRef Base) -> std::optional<std::string> {
+ for (const char *LibName : OrcRTLibNames) {
+ llvm::SmallString<256> CandidatePath(Base);
+ llvm::sys::path::append(CandidatePath, LibName);
+ if (llvm::sys::fs::exists(CandidatePath))
+ return std::string(CandidatePath.str());
+ }
+ return std::nullopt;
+ };
+
+ std::string SearchedPaths;
- if (!CompilerRTPath) {
+ if (std::optional<std::string> CompilerRTPath = TC.getCompilerRTPath()) {
+ if (auto Found = findInDir(*CompilerRTPath))
+ return *Found;
+ SearchedPaths += *CompilerRTPath;
+ } else {
return llvm::make_error<llvm::StringError>("CompilerRT path not found",
std::error_code());
}
- const std::array<const char *, 3> OrcRTLibNames = {
- "liborc_rt.a", "liborc_rt_osx.a", "liborc_rt-x86_64.a"};
-
- for (const char *LibName : OrcRTLibNames) {
- llvm::SmallString<256> CandidatePath((*CompilerRTPath).c_str());
- llvm::sys::path::append(CandidatePath, LibName);
-
- if (llvm::sys::fs::exists(CandidatePath)) {
- return CandidatePath.str().str();
- }
+  if (std::optional<std::string> RuntimePath = TC.getRuntimePath()) {
+    if (auto Found = findInDir(*RuntimePath))
+      return *Found;
+    if (!SearchedPaths.empty())
+      SearchedPaths += "; ";
+    SearchedPaths += *RuntimePath;
+  } else {
+    return llvm::make_error<llvm::StringError>("Runtime path not found",
+                                               std::error_code());
}
return llvm::make_error<llvm::StringError>(
- llvm::Twine("OrcRuntime library not found in: ") + (*CompilerRTPath),
+ llvm::Twine("OrcRuntime library not found in: ") + SearchedPaths,
std::error_code());
}
llvm::Expected<std::unique_ptr<Interpreter>>
Interpreter::create(std::unique_ptr<CompilerInstance> CI, JITConfig Config) {
- llvm::Error Err = llvm::Error::success();
-
- std::unique_ptr<llvm::orc::LLJITBuilder> JB;
if (Config.IsOutOfProcess) {
const TargetInfo &TI = CI->getTarget();
@@ -453,6 +465,9 @@ Interpreter::create(std::unique_ptr<CompilerInstance> CI, JITConfig Config) {
}
}
+ llvm::Error Err = llvm::Error::success();
+ std::unique_ptr<llvm::orc::LLJITBuilder> JB;
+
auto Interp = std::unique_ptr<Interpreter>(new Interpreter(
std::move(CI), Err, std::move(JB), /*Consumer=*/nullptr, Config));
if (auto E = std::move(Err))
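
The lookup above reduces to a reusable pattern: probe an ordered list of directories for any of several candidate file names, and remember every directory tried so a failure can report all of them. A standalone sketch using std::filesystem in place of llvm::sys; the parameter names are placeholders, not the real toolchain queries:

#include <filesystem>
#include <optional>
#include <string>
#include <vector>

// Returns the first Dir/Lib combination that exists on disk; on failure,
// SearchedPaths holds every directory probed, ready for the error message.
std::optional<std::string>
findFirstExisting(const std::vector<std::string> &Dirs,
                  const std::vector<std::string> &LibNames,
                  std::string &SearchedPaths) {
  for (const std::string &Dir : Dirs) {
    for (const std::string &Lib : LibNames) {
      std::filesystem::path Candidate = std::filesystem::path(Dir) / Lib;
      if (std::filesystem::exists(Candidate))
        return Candidate.string();
    }
    if (!SearchedPaths.empty())
      SearchedPaths += "; ";
    SearchedPaths += Dir;
  }
  return std::nullopt;
}
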
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index 74f87a8..7a5d28c 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -772,9 +772,11 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
-  // Produce a diagnostic if we're not tentatively parsing; otherwise track
-  // that our parse has failed.
+  // Produce a diagnostic if we're not tentatively parsing; otherwise record
+  // the given tentative-parse state (Invalid by default).
- auto Invalid = [&](llvm::function_ref<void()> Action) {
+ auto Result = [&](llvm::function_ref<void()> Action,
+ LambdaIntroducerTentativeParse State =
+ LambdaIntroducerTentativeParse::Invalid) {
if (Tentative) {
- *Tentative = LambdaIntroducerTentativeParse::Invalid;
+ *Tentative = State;
return false;
}
Action();
@@ -824,7 +826,7 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
break;
}
- return Invalid([&] {
+ return Result([&] {
Diag(Tok.getLocation(), diag::err_expected_comma_or_rsquare);
});
}
@@ -861,7 +863,7 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
ConsumeToken();
Kind = LCK_StarThis;
} else {
- return Invalid([&] {
+ return Result([&] {
Diag(Tok.getLocation(), diag::err_expected_star_this_capture);
});
}
@@ -875,8 +877,9 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
// or the start of a capture (in the "&" case) with the rest of the
// capture missing. Both are an error but a misplaced capture-default
// is more likely if we don't already have a capture default.
- return Invalid(
- [&] { Diag(Tok.getLocation(), diag::err_capture_default_first); });
+ return Result(
+ [&] { Diag(Tok.getLocation(), diag::err_capture_default_first); },
+ LambdaIntroducerTentativeParse::Incomplete);
} else {
TryConsumeToken(tok::ellipsis, EllipsisLocs[0]);
@@ -899,14 +902,13 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
Id = Tok.getIdentifierInfo();
Loc = ConsumeToken();
} else if (Tok.is(tok::kw_this)) {
- return Invalid([&] {
+ return Result([&] {
// FIXME: Suggest a fixit here.
Diag(Tok.getLocation(), diag::err_this_captured_by_reference);
});
} else {
- return Invalid([&] {
- Diag(Tok.getLocation(), diag::err_expected_capture);
- });
+ return Result(
+ [&] { Diag(Tok.getLocation(), diag::err_expected_capture); });
}
TryConsumeToken(tok::ellipsis, EllipsisLocs[2]);
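
A hypothetical input for the new Incomplete state: when no capture-default has been seen yet, a trailing '&' after named captures is more plausibly a misplaced capture-default than a capture missing its name, so the tentative parse is recorded as incomplete rather than invalid:

int demo() {
  int x = 0;
  // auto f = [x, &] { return x; }; // error: capture-default must be first
  auto g = [&, x] { return x; };    // OK once the default comes first
  return g();
}
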
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 23bf7f2..46addea 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -321,9 +321,8 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer,
static_cast<unsigned>(ComparisonCategoryType::Last) + 1),
StdSourceLocationImplDecl(nullptr), CXXTypeInfoDecl(nullptr),
GlobalNewDeleteDeclared(false), DisableTypoCorrection(false),
- TyposCorrected(0), IsBuildingRecoveryCallExpr(false), NumSFINAEErrors(0),
- AccessCheckingSFINAE(false), CurrentInstantiationScope(nullptr),
- InNonInstantiationSFINAEContext(false), NonInstantiationEntries(0),
+ TyposCorrected(0), IsBuildingRecoveryCallExpr(false),
+ CurrentInstantiationScope(nullptr), NonInstantiationEntries(0),
ArgPackSubstIndex(std::nullopt), SatisfactionCache(Context) {
assert(pp.TUKind == TUKind);
TUScope = nullptr;
@@ -670,7 +669,9 @@ void Sema::addExternalSource(IntrusiveRefCntPtr<ExternalSemaSource> E) {
void Sema::PrintStats() const {
llvm::errs() << "\n*** Semantic Analysis Stats:\n";
- llvm::errs() << NumSFINAEErrors << " SFINAE diagnostics trapped.\n";
+ if (SFINAETrap *Trap = getSFINAEContext())
+ llvm::errs() << int(Trap->hasErrorOccurred())
+ << " SFINAE diagnostics trapped.\n";
BumpAlloc.PrintStats();
AnalysisWarnings.PrintStats();
@@ -1681,7 +1682,8 @@ void Sema::EmitDiagnostic(unsigned DiagID, const DiagnosticBuilder &DB) {
// issue I am not seeing yet), then there should at least be a clarifying
// comment somewhere.
Diagnostic DiagInfo(&Diags, DB);
- if (std::optional<TemplateDeductionInfo *> Info = isSFINAEContext()) {
+ if (SFINAETrap *Trap = getSFINAEContext()) {
+ sema::TemplateDeductionInfo *Info = Trap->getDeductionInfo();
switch (DiagnosticIDs::getDiagnosticSFINAEResponse(DiagInfo.getID())) {
case DiagnosticIDs::SFINAE_Report:
// We'll report the diagnostic below.
@@ -1690,37 +1692,37 @@ void Sema::EmitDiagnostic(unsigned DiagID, const DiagnosticBuilder &DB) {
case DiagnosticIDs::SFINAE_SubstitutionFailure:
// Count this failure so that we know that template argument deduction
// has failed.
- ++NumSFINAEErrors;
+ Trap->setErrorOccurred();
// Make a copy of this suppressed diagnostic and store it with the
// template-deduction information.
- if (*Info && !(*Info)->hasSFINAEDiagnostic()) {
- (*Info)->addSFINAEDiagnostic(DiagInfo.getLocation(),
- PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
- }
+ if (Info && !Info->hasSFINAEDiagnostic())
+ Info->addSFINAEDiagnostic(
+ DiagInfo.getLocation(),
+ PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
Diags.setLastDiagnosticIgnored(true);
return;
case DiagnosticIDs::SFINAE_AccessControl: {
// Per C++ Core Issue 1170, access control is part of SFINAE.
- // Additionally, the AccessCheckingSFINAE flag can be used to temporarily
+ // Additionally, the WithAccessChecking flag can be used to temporarily
// make access control a part of SFINAE for the purposes of checking
// type traits.
- if (!AccessCheckingSFINAE && !getLangOpts().CPlusPlus11)
+ if (!Trap->withAccessChecking() && !getLangOpts().CPlusPlus11)
break;
SourceLocation Loc = DiagInfo.getLocation();
// Suppress this diagnostic.
- ++NumSFINAEErrors;
+ Trap->setErrorOccurred();
// Make a copy of this suppressed diagnostic and store it with the
// template-deduction information.
- if (*Info && !(*Info)->hasSFINAEDiagnostic()) {
- (*Info)->addSFINAEDiagnostic(DiagInfo.getLocation(),
- PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
- }
+ if (Info && !Info->hasSFINAEDiagnostic())
+ Info->addSFINAEDiagnostic(
+ DiagInfo.getLocation(),
+ PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
Diags.setLastDiagnosticIgnored(true);
@@ -1740,13 +1742,13 @@ void Sema::EmitDiagnostic(unsigned DiagID, const DiagnosticBuilder &DB) {
return;
// Make a copy of this suppressed diagnostic and store it with the
// template-deduction information;
- if (*Info) {
- (*Info)->addSuppressedDiagnostic(
+ if (Info) {
+ Info->addSuppressedDiagnostic(
DiagInfo.getLocation(),
PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
if (!Diags.getDiagnosticIDs()->isNote(DiagID))
PrintContextStack([Info](SourceLocation Loc, PartialDiagnostic PD) {
- (*Info)->addSuppressedDiagnostic(Loc, std::move(PD));
+ Info->addSuppressedDiagnostic(Loc, std::move(PD));
});
}
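
Stripped of Sema details, the replacement for the NumSFINAEErrors counter is an intrusive stack of traps: each trap saves the current one, installs itself, and records whether an error fired while it was innermost. A minimal sketch with assumed names (per the later hunks, the real SFINAETrap also carries the TemplateDeductionInfo):

// Stand-in for Sema::SFINAETrap; Current models Sema::CurrentSFINAEContext.
class TrapSketch {
  TrapSketch *&Current;
  TrapSketch *Saved;
  bool ErrorOccurred = false;

public:
  explicit TrapSketch(TrapSketch *&Cur) : Current(Cur), Saved(Cur) {
    Current = this; // become the innermost trap
  }
  ~TrapSketch() { Current = Saved; } // restore the enclosing trap, if any

  void setErrorOccurred() { ErrorOccurred = true; }
  bool hasErrorOccurred() const { return ErrorOccurred; }
};
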
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 139c4ab..cece220 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -558,6 +558,8 @@ AMDGPUMaxNumWorkGroupsAttr *SemaAMDGPU::CreateAMDGPUMaxNumWorkGroupsAttr(
const AttributeCommonInfo &CI, Expr *XExpr, Expr *YExpr, Expr *ZExpr) {
ASTContext &Context = getASTContext();
AMDGPUMaxNumWorkGroupsAttr TmpAttr(Context, CI, XExpr, YExpr, ZExpr);
+ assert(!SemaRef.isSFINAEContext() &&
+ "Can't produce SFINAE diagnostic pointing to temporary attribute");
if (checkAMDGPUMaxNumWorkGroupsArguments(SemaRef, XExpr, YExpr, ZExpr,
TmpAttr))
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index fb4d0b45..883e341 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -526,12 +526,12 @@ ExprResult ConstraintSatisfactionChecker::EvaluateAtomicConstraint(
S, AtomicExpr->getBeginLoc(),
Sema::InstantiatingTemplate::ConstraintSubstitution{},
// FIXME: improve const-correctness of InstantiatingTemplate
- const_cast<NamedDecl *>(Template), Info, AtomicExpr->getSourceRange());
+ const_cast<NamedDecl *>(Template), AtomicExpr->getSourceRange());
if (Inst.isInvalid())
return ExprError();
// We do not want error diagnostics escaping here.
- Sema::SFINAETrap Trap(S);
+ Sema::SFINAETrap Trap(S, Info);
SubstitutedExpression =
S.SubstConstraintExpr(const_cast<Expr *>(AtomicExpr), MLTAL);
@@ -599,16 +599,15 @@ ConstraintSatisfactionChecker::SubstitutionInTemplateArguments(
return MultiLevelTemplateArgumentList();
TemplateDeductionInfo Info(Constraint.getBeginLoc());
+ Sema::SFINAETrap Trap(S, Info);
Sema::InstantiatingTemplate Inst(
S, Constraint.getBeginLoc(),
Sema::InstantiatingTemplate::ConstraintSubstitution{},
// FIXME: improve const-correctness of InstantiatingTemplate
- const_cast<NamedDecl *>(Template), Info, Constraint.getSourceRange());
+ const_cast<NamedDecl *>(Template), Constraint.getSourceRange());
if (Inst.isInvalid())
return std::nullopt;
- Sema::SFINAETrap Trap(S);
-
TemplateArgumentListInfo SubstArgs;
Sema::ArgPackSubstIndexRAII SubstIndex(
S, Constraint.getPackSubstitutionIndex()
@@ -778,9 +777,6 @@ ConstraintSatisfactionChecker::EvaluateFoldExpandedConstraintSize(
const FoldExpandedConstraint &FE,
const MultiLevelTemplateArgumentList &MLTAL) {
- // We should ignore errors in the presence of packs of different size.
- Sema::SFINAETrap Trap(S);
-
Expr *Pattern = const_cast<Expr *>(FE.getPattern());
SmallVector<UnexpandedParameterPack, 2> Unexpanded;
@@ -792,18 +788,12 @@ ConstraintSatisfactionChecker::EvaluateFoldExpandedConstraintSize(
if (S.CheckParameterPacksForExpansion(
Pattern->getExprLoc(), Pattern->getSourceRange(), Unexpanded, MLTAL,
/*FailOnPackProducingTemplates=*/false, Expand, RetainExpansion,
- NumExpansions) ||
+ NumExpansions, /*Diagnose=*/false) ||
!Expand || RetainExpansion)
return std::nullopt;
- if (NumExpansions && S.getLangOpts().BracketDepth < *NumExpansions) {
- S.Diag(Pattern->getExprLoc(),
- clang::diag::err_fold_expression_limit_exceeded)
- << *NumExpansions << S.getLangOpts().BracketDepth
- << Pattern->getSourceRange();
- S.Diag(Pattern->getExprLoc(), diag::note_bracket_depth);
+ if (NumExpansions && S.getLangOpts().BracketDepth < *NumExpansions)
return std::nullopt;
- }
return NumExpansions;
}
@@ -921,7 +911,6 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
return ExprError();
}
- Sema::SFINAETrap Trap(S);
Sema::ArgPackSubstIndexRAII SubstIndex(
S, Constraint.getPackSubstitutionIndex()
? Constraint.getPackSubstitutionIndex()
@@ -930,9 +919,10 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
const ASTTemplateArgumentListInfo *Ori =
ConceptId->getTemplateArgsAsWritten();
TemplateDeductionInfo Info(TemplateNameLoc);
- Sema::InstantiatingTemplate _(
+ Sema::SFINAETrap Trap(S, Info);
+ Sema::InstantiatingTemplate _2(
S, TemplateNameLoc, Sema::InstantiatingTemplate::ConstraintSubstitution{},
- const_cast<NamedDecl *>(Template), Info, Constraint.getSourceRange());
+ const_cast<NamedDecl *>(Template), Constraint.getSourceRange());
TemplateArgumentListInfo OutArgs(Ori->LAngleLoc, Ori->RAngleLoc);
if (S.SubstTemplateArguments(Ori->arguments(), *SubstitutedArgs, OutArgs) ||
@@ -1142,13 +1132,21 @@ static bool CheckConstraintSatisfaction(
if (TemplateArgsLists.getNumLevels() != 0)
Args = TemplateArgsLists.getInnermost();
- std::optional<Sema::InstantiatingTemplate> SynthesisContext;
- if (!TopLevelConceptId) {
- SynthesisContext.emplace(S, TemplateIDRange.getBegin(),
- Sema::InstantiatingTemplate::ConstraintsCheck{},
- const_cast<NamedDecl *>(Template), Args,
+ struct SynthesisContextPair {
+ Sema::InstantiatingTemplate Inst;
+ Sema::NonSFINAEContext NSC;
+ SynthesisContextPair(Sema &S, NamedDecl *Template,
+ ArrayRef<TemplateArgument> TemplateArgs,
+ SourceRange InstantiationRange)
+ : Inst(S, InstantiationRange.getBegin(),
+ Sema::InstantiatingTemplate::ConstraintsCheck{}, Template,
+ TemplateArgs, InstantiationRange),
+ NSC(S) {}
+ };
+ std::optional<SynthesisContextPair> SynthesisContext;
+ if (!TopLevelConceptId)
+ SynthesisContext.emplace(S, const_cast<NamedDecl *>(Template), Args,
TemplateIDRange);
- }
const NormalizedConstraint *C =
S.getNormalizedAssociatedConstraints(Template, AssociatedConstraints);
@@ -1478,8 +1476,7 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction(
if (MLTAL.getNumSubstitutedLevels() == 0)
return ConstrExpr;
- Sema::SFINAETrap SFINAE(S);
-
+ Sema::NonSFINAEContext _(S);
Sema::InstantiatingTemplate Inst(
S, DeclInfo.getLocation(),
Sema::InstantiatingTemplate::ConstraintNormalization{},
@@ -1554,7 +1551,7 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction(
Sema::ReuseLambdaContextDecl);
ExprResult SubstConstr = S.SubstConstraintExprWithoutSatisfaction(
const_cast<clang::Expr *>(ConstrExpr), MLTAL);
- if (SFINAE.hasErrorOccurred() || !SubstConstr.isUsable())
+ if (!SubstConstr.isUsable())
return nullptr;
return SubstConstr.get();
}
@@ -2104,6 +2101,7 @@ bool SubstituteParameterMappings::substitute(
InstLocBegin = SR.getBegin();
InstLocEnd = SR.getEnd();
}
+ Sema::NonSFINAEContext _(SemaRef);
Sema::InstantiatingTemplate Inst(
SemaRef, InstLocBegin,
Sema::InstantiatingTemplate::ParameterMappingSubstitution{},
@@ -2171,6 +2169,7 @@ bool SubstituteParameterMappings::substitute(ConceptIdConstraint &CC) {
InstLocBegin = SR.getBegin();
InstLocEnd = SR.getEnd();
}
+ Sema::NonSFINAEContext _(SemaRef);
// This is useful for name lookup across modules; see Sema::getLookupModules.
Sema::InstantiatingTemplate Inst(
SemaRef, InstLocBegin,
@@ -2311,6 +2310,7 @@ NormalizedConstraint *NormalizedConstraint::fromConstraintExpr(
} else if (auto *CSE = dyn_cast<const ConceptSpecializationExpr>(E)) {
NormalizedConstraint *SubNF;
{
+ Sema::NonSFINAEContext _(S);
Sema::InstantiatingTemplate Inst(
S, CSE->getExprLoc(),
Sema::InstantiatingTemplate::ConstraintNormalization{},
@@ -2546,8 +2546,6 @@ bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic(
};
{
- // The subsumption checks might cause diagnostics
- SFINAETrap Trap(*this);
auto *Normalized1 = getNormalizedAssociatedConstraints(D1, AC1);
if (!Normalized1)
return false;
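
The SynthesisContextPair above is a general trick worth naming: when two RAII guards must engage and disengage together, and only conditionally, bundle them in a struct so a single std::optional controls both lifetimes. A generic sketch with the guard bodies elided:

#include <optional>

struct InstGuard {};     // e.g. InstantiatingTemplate
struct NoSFINAEGuard {}; // e.g. NonSFINAEContext

struct GuardPair {
  InstGuard Inst;    // constructed first, destroyed last
  NoSFINAEGuard NSC;
};

void check(bool NeedContext) {
  std::optional<GuardPair> Guards;
  if (NeedContext)
    Guards.emplace(); // both guards engage together
  // ... satisfaction checking ...
  // Guards.reset() would release both early, as Inst.Clear() does elsewhere.
}
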
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index fc3aabf..086dd8b 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -8492,12 +8492,11 @@ void Sema::CheckShadow(NamedDecl *D, NamedDecl *ShadowedDecl,
DeclContext *NewDC = D->getDeclContext();
if (FieldDecl *FD = dyn_cast<FieldDecl>(ShadowedDecl)) {
- if (CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(NewDC)) {
- // Fields are not shadowed by variables in C++ static methods.
- if (MD->isStatic())
- return;
-
- if (!MD->getParent()->isLambda() && MD->isExplicitObjectMemberFunction())
+ if (const auto *MD =
+ dyn_cast<CXXMethodDecl>(getFunctionLevelDeclContext())) {
+      // Fields aren't shadowed in C++ static member functions or in member
+      // functions with an explicit object parameter.
+ if (MD->isStatic() || MD->isExplicitObjectMemberFunction())
return;
}
// Fields shadowed by constructor parameters are a special case. Usually
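
Presumed shadow-warning behavior after this change (the explicit-object case is C++23; these are illustrations, not the patch's tests):

struct S {
  int x;
  static void f() { int x = 0; (void)x; }     // no 'this': not a shadow
  void g(this S self) { int x = 0; (void)x; } // explicit object: not a shadow
  void h() { int x = 0; (void)x; }            // shadows S::x: still warned
};
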
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index a50c276..3eb935c 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -12653,10 +12653,10 @@ QualType Sema::CheckCompareOperands(ExprResult &LHS, ExprResult &RHS,
// This is a gcc extension compatibility comparison.
// In a SFINAE context, we treat this as a hard error to maintain
// conformance with the C++ standard.
- diagnoseFunctionPointerToVoidComparison(
- *this, Loc, LHS, RHS, /*isError*/ (bool)isSFINAEContext());
+ bool IsError = isSFINAEContext();
+ diagnoseFunctionPointerToVoidComparison(*this, Loc, LHS, RHS, IsError);
- if (isSFINAEContext())
+ if (IsError)
return QualType();
RHS = ImpCastExprToType(RHS.get(), LHSType, CK_BitCast);
@@ -14598,11 +14598,11 @@ QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) {
unsigned AddressOfError = AO_No_Error;
if (lval == Expr::LV_ClassTemporary || lval == Expr::LV_ArrayTemporary) {
- bool sfinae = (bool)isSFINAEContext();
- Diag(OpLoc, isSFINAEContext() ? diag::err_typecheck_addrof_temporary
- : diag::ext_typecheck_addrof_temporary)
- << op->getType() << op->getSourceRange();
- if (sfinae)
+ bool IsError = isSFINAEContext();
+ Diag(OpLoc, IsError ? diag::err_typecheck_addrof_temporary
+ : diag::ext_typecheck_addrof_temporary)
+ << op->getType() << op->getSourceRange();
+ if (IsError)
return QualType();
// Materialize the temporary as an lvalue so that we can take its address.
OrigOp = op =
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 983a784..4a9e1bc 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -3846,13 +3846,14 @@ QualType Sema::CheckTemplateIdType(ElaboratedTypeKeyword Keyword,
// within enable_if in a SFINAE context, dig out the specific
// enable_if condition that failed and present that instead.
if (isEnableIfAliasTemplate(AliasTemplate)) {
- if (auto DeductionInfo = isSFINAEContext()) {
- if (*DeductionInfo &&
- (*DeductionInfo)->hasSFINAEDiagnostic() &&
- (*DeductionInfo)->peekSFINAEDiagnostic().second.getDiagID() ==
- diag::err_typename_nested_not_found_enable_if &&
- TemplateArgs[0].getArgument().getKind()
- == TemplateArgument::Expression) {
+ if (SFINAETrap *Trap = getSFINAEContext();
+ TemplateDeductionInfo *DeductionInfo =
+ Trap ? Trap->getDeductionInfo() : nullptr) {
+ if (DeductionInfo->hasSFINAEDiagnostic() &&
+ DeductionInfo->peekSFINAEDiagnostic().second.getDiagID() ==
+ diag::err_typename_nested_not_found_enable_if &&
+ TemplateArgs[0].getArgument().getKind() ==
+ TemplateArgument::Expression) {
Expr *FailedCond;
std::string FailedDescription;
std::tie(FailedCond, FailedDescription) =
@@ -3861,15 +3862,14 @@ QualType Sema::CheckTemplateIdType(ElaboratedTypeKeyword Keyword,
// Remove the old SFINAE diagnostic.
PartialDiagnosticAt OldDiag =
{SourceLocation(), PartialDiagnostic::NullDiagnostic()};
- (*DeductionInfo)->takeSFINAEDiagnostic(OldDiag);
+ DeductionInfo->takeSFINAEDiagnostic(OldDiag);
// Add a new SFINAE diagnostic specifying which condition
// failed.
- (*DeductionInfo)->addSFINAEDiagnostic(
- OldDiag.first,
- PDiag(diag::err_typename_nested_not_found_requirement)
- << FailedDescription
- << FailedCond->getSourceRange());
+ DeductionInfo->addSFINAEDiagnostic(
+ OldDiag.first,
+ PDiag(diag::err_typename_nested_not_found_requirement)
+ << FailedDescription << FailedCond->getSourceRange());
}
}
}
@@ -3955,6 +3955,7 @@ QualType Sema::CheckTemplateIdType(ElaboratedTypeKeyword Keyword,
if (Decl->getSpecializationKind() == TSK_Undeclared &&
ClassTemplate->getTemplatedDecl()->hasAttrs()) {
+ NonSFINAEContext _(*this);
InstantiatingTemplate Inst(*this, TemplateLoc, Decl);
if (!Inst.isInvalid()) {
MultiLevelTemplateArgumentList TemplateArgLists(Template,
@@ -5565,12 +5566,11 @@ bool Sema::CheckTemplateArgument(NamedDecl *Param, TemplateArgumentLoc &ArgLoc,
auto checkExpr = [&](Expr *E) -> Expr * {
TemplateArgument SugaredResult, CanonicalResult;
- unsigned CurSFINAEErrors = NumSFINAEErrors;
ExprResult Res = CheckTemplateArgument(
NTTP, NTTPType, E, SugaredResult, CanonicalResult,
/*StrictCheck=*/CTAI.MatchingTTP || CTAI.PartialOrdering, CTAK);
// If the current template argument causes an error, give up now.
- if (Res.isInvalid() || CurSFINAEErrors < NumSFINAEErrors)
+ if (Res.isInvalid())
return nullptr;
CTAI.SugaredConverted.push_back(SugaredResult);
CTAI.CanonicalConverted.push_back(CanonicalResult);
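
A hypothetical reduction of the enable_if case handled above: when substitution into the alias fails during deduction, the stored SFINAE diagnostic is rewritten to name the condition that failed instead of the generic missing-member error:

#include <type_traits>

template <class T, class = std::enable_if_t<std::is_integral_v<T>>>
int only_integral(T);
char only_integral(...);

// Deduction for the template quietly fails for 1.5 (SFINAE); the rewritten
// diagnostic would surface only if the failure had to be reported.
static_assert(sizeof(only_integral(1.5)) == sizeof(char));
static_assert(sizeof(only_integral(1)) == sizeof(int));
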
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 6964242..a287319 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -3239,10 +3239,6 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
ArrayRef<TemplateArgumentLoc> Ps, ArrayRef<TemplateArgument> As,
SmallVectorImpl<DeducedTemplateArgument> &Deduced,
TemplateDeductionInfo &Info, bool CopyDeducedArgs) {
- // Unevaluated SFINAE context.
- EnterExpressionEvaluationContext Unevaluated(
- S, Sema::ExpressionEvaluationContext::Unevaluated);
-
Sema::ContextRAII SavedContext(S, getAsDeclContextOrEnclosing(Entity));
// C++ [temp.deduct.type]p2:
@@ -3380,10 +3376,6 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
Sema &S, TemplateDecl *TD,
SmallVectorImpl<DeducedTemplateArgument> &Deduced,
TemplateDeductionInfo &Info) {
- // Unevaluated SFINAE context.
- EnterExpressionEvaluationContext Unevaluated(
- S, Sema::ExpressionEvaluationContext::Unevaluated);
-
Sema::ContextRAII SavedContext(S, getAsDeclContextOrEnclosing(TD));
// C++ [temp.deduct.type]p2:
@@ -3423,7 +3415,7 @@ DeduceTemplateArguments(Sema &S, T *Partial,
// Unevaluated SFINAE context.
EnterExpressionEvaluationContext Unevaluated(
S, Sema::ExpressionEvaluationContext::Unevaluated);
- Sema::SFINAETrap Trap(S);
+ Sema::SFINAETrap Trap(S, Info);
// This deduction has no relation to any outer instantiation we might be
// performing.
@@ -3441,8 +3433,7 @@ DeduceTemplateArguments(Sema &S, T *Partial,
return Result;
SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
- Sema::InstantiatingTemplate Inst(S, Info.getLocation(), Partial, DeducedArgs,
- Info);
+ Sema::InstantiatingTemplate Inst(S, Info.getLocation(), Partial, DeducedArgs);
if (Inst.isInvalid())
return TemplateDeductionResult::InstantiationDepth;
@@ -3497,7 +3488,7 @@ Sema::DeduceTemplateArgumentsFromType(TemplateDecl *TD, QualType FromType,
// Unevaluated SFINAE context.
EnterExpressionEvaluationContext Unevaluated(
*this, Sema::ExpressionEvaluationContext::Unevaluated);
- SFINAETrap Trap(*this);
+ SFINAETrap Trap(*this, Info);
// This deduction has no relation to any outer instantiation we might be
// performing.
@@ -3514,7 +3505,7 @@ Sema::DeduceTemplateArgumentsFromType(TemplateDecl *TD, QualType FromType,
}
SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
- InstantiatingTemplate Inst(*this, Info.getLocation(), TD, DeducedArgs, Info);
+ InstantiatingTemplate Inst(*this, Info.getLocation(), TD, DeducedArgs);
if (Inst.isInvalid())
return TemplateDeductionResult::InstantiationDepth;
@@ -3558,6 +3549,9 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
SmallVectorImpl<DeducedTemplateArgument> &Deduced,
SmallVectorImpl<QualType> &ParamTypes, QualType *FunctionType,
TemplateDeductionInfo &Info) {
+ assert(isSFINAEContext());
+ assert(isUnevaluatedContext());
+
FunctionDecl *Function = FunctionTemplate->getTemplatedDecl();
TemplateParameterList *TemplateParams
= FunctionTemplate->getTemplateParameters();
@@ -3573,11 +3567,6 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
return TemplateDeductionResult::Success;
}
- // Unevaluated SFINAE context.
- EnterExpressionEvaluationContext Unevaluated(
- *this, Sema::ExpressionEvaluationContext::Unevaluated);
- SFINAETrap Trap(*this);
-
// C++ [temp.arg.explicit]p3:
// Template arguments that are present shall be specified in the
// declaration order of their corresponding template-parameters. The
@@ -3590,7 +3579,7 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
SmallVector<TemplateArgument, 4> DeducedArgs;
InstantiatingTemplate Inst(
*this, Info.getLocation(), FunctionTemplate, DeducedArgs,
- CodeSynthesisContext::ExplicitTemplateArgumentSubstitution, Info);
+ CodeSynthesisContext::ExplicitTemplateArgumentSubstitution);
if (Inst.isInvalid())
return TemplateDeductionResult::InstantiationDepth;
@@ -3598,8 +3587,7 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
if (CheckTemplateArgumentList(FunctionTemplate, SourceLocation(),
ExplicitTemplateArgs, /*DefaultArgs=*/{},
/*PartialTemplateArgs=*/true, CTAI,
- /*UpdateArgsWithConversions=*/false) ||
- Trap.hasErrorOccurred()) {
+ /*UpdateArgsWithConversions=*/false)) {
unsigned Index = CTAI.SugaredConverted.size();
if (Index >= TemplateParams->size())
return TemplateDeductionResult::SubstitutionFailure;
@@ -3688,7 +3676,7 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
ResultType =
SubstType(Proto->getReturnType(), MLTAL,
Function->getTypeSpecStartLoc(), Function->getDeclName());
- if (ResultType.isNull() || Trap.hasErrorOccurred())
+ if (ResultType.isNull())
return TemplateDeductionResult::SubstitutionFailure;
// CUDA: Kernel function must have 'void' return type.
if (getLangOpts().CUDA)
@@ -3714,7 +3702,7 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
Function->getLocation(),
Function->getDeclName(),
EPI);
- if (FunctionType->isNull() || Trap.hasErrorOccurred())
+ if (FunctionType->isNull())
return TemplateDeductionResult::SubstitutionFailure;
}
@@ -3912,12 +3900,15 @@ static TemplateDeductionResult instantiateExplicitSpecifierDeferred(
if (!ExplicitExpr->isValueDependent())
return TemplateDeductionResult::Success;
+  // By this point, FinishTemplateArgumentDeduction will have reverted to a
+  // regular non-SFINAE template instantiation context, so set up a new
+  // SFINAE context.
Sema::InstantiatingTemplate Inst(
S, Info.getLocation(), FunctionTemplate, DeducedArgs,
- Sema::CodeSynthesisContext::DeducedTemplateArgumentSubstitution, Info);
+ Sema::CodeSynthesisContext::DeducedTemplateArgumentSubstitution);
if (Inst.isInvalid())
return TemplateDeductionResult::InstantiationDepth;
- Sema::SFINAETrap Trap(S);
+ Sema::SFINAETrap Trap(S, Info);
const ExplicitSpecifier InstantiatedES =
S.instantiateExplicitSpecifier(SubstArgs, ES);
if (InstantiatedES.isInvalid() || Trap.hasErrorOccurred()) {
@@ -3937,17 +3928,12 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
bool PartialOverloading, bool PartialOrdering,
bool ForOverloadSetAddressResolution,
llvm::function_ref<bool(bool)> CheckNonDependent) {
- // Unevaluated SFINAE context.
- EnterExpressionEvaluationContext Unevaluated(
- *this, Sema::ExpressionEvaluationContext::Unevaluated);
- SFINAETrap Trap(*this);
-
// Enter a new template instantiation context while we instantiate the
// actual function declaration.
SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
InstantiatingTemplate Inst(
*this, Info.getLocation(), FunctionTemplate, DeducedArgs,
- CodeSynthesisContext::DeducedTemplateArgumentSubstitution, Info);
+ CodeSynthesisContext::DeducedTemplateArgumentSubstitution);
if (Inst.isInvalid())
return TemplateDeductionResult::InstantiationDepth;
@@ -4030,18 +4016,9 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
// If the template argument list is owned by the function template
// specialization, release it.
if (Specialization->getTemplateSpecializationArgs() ==
- CanonicalDeducedArgumentList &&
- !Trap.hasErrorOccurred())
+ CanonicalDeducedArgumentList)
Info.takeCanonical();
- // There may have been an error that did not prevent us from constructing a
- // declaration. Mark the declaration invalid and return with a substitution
- // failure.
- if (Trap.hasErrorOccurred()) {
- Specialization->setInvalidDecl(true);
- return TemplateDeductionResult::SubstitutionFailure;
- }
-
// C++2a [temp.deduct]p5
// [...] When all template arguments have been deduced [...] all uses of
// template parameters [...] are replaced with the corresponding deduced
@@ -4553,6 +4530,10 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
return TemplateDeductionResult::TooManyArguments;
}
+ EnterExpressionEvaluationContext Unevaluated(
+ *this, Sema::ExpressionEvaluationContext::Unevaluated);
+ Sema::SFINAETrap Trap(*this, Info);
+
// The types of the parameters from which we will perform template argument
// deduction.
LocalInstantiationScope InstScope(*this);
@@ -4570,6 +4551,8 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
});
if (Result != TemplateDeductionResult::Success)
return Result;
+ if (Trap.hasErrorOccurred())
+ return TemplateDeductionResult::SubstitutionFailure;
NumExplicitlySpecified = Deduced.size();
} else {
@@ -4743,6 +4726,11 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
OnlyInitializeNonUserDefinedConversions);
});
});
+ if (Trap.hasErrorOccurred()) {
+ if (Specialization)
+ Specialization->setInvalidDecl(true);
+ return TemplateDeductionResult::SubstitutionFailure;
+ }
return Result;
}
@@ -4795,6 +4783,14 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
= FunctionTemplate->getTemplateParameters();
QualType FunctionType = Function->getType();
+ bool PotentiallyEvaluated =
+ currentEvaluationContext().isPotentiallyEvaluated();
+
+ // Unevaluated SFINAE context.
+ EnterExpressionEvaluationContext Unevaluated(
+ *this, Sema::ExpressionEvaluationContext::Unevaluated);
+ SFINAETrap Trap(*this, Info);
+
// Substitute any explicit template arguments.
LocalInstantiationScope InstScope(*this);
SmallVector<DeducedTemplateArgument, 4> Deduced;
@@ -4809,6 +4805,8 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
});
if (Result != TemplateDeductionResult::Success)
return Result;
+ if (Trap.hasErrorOccurred())
+ return TemplateDeductionResult::SubstitutionFailure;
NumExplicitlySpecified = Deduced.size();
}
@@ -4820,11 +4818,6 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
ArgFunctionType = adjustCCAndNoReturn(ArgFunctionType, FunctionType,
/*AdjustExceptionSpec*/false);
- // Unevaluated SFINAE context.
- std::optional<EnterExpressionEvaluationContext> Unevaluated(
- std::in_place, *this, Sema::ExpressionEvaluationContext::Unevaluated);
- SFINAETrap Trap(*this);
-
Deduced.resize(TemplateParams->size());
// If the function has a deduced return type, substitute it for a dependent
@@ -4865,14 +4858,12 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
DeduceReturnType(Specialization, Info.getLocation(), false))
return TemplateDeductionResult::MiscellaneousDeductionFailure;
- Unevaluated = std::nullopt;
// [C++26][expr.const]/p17
// An expression or conversion is immediate-escalating if it is not initially
// in an immediate function context and it is [...]
// a potentially-evaluated id-expression that denotes an immediate function.
if (IsAddressOfFunction && getLangOpts().CPlusPlus20 &&
- Specialization->isImmediateEscalating() &&
- currentEvaluationContext().isPotentiallyEvaluated() &&
+ Specialization->isImmediateEscalating() && PotentiallyEvaluated &&
CheckIfFunctionSpecializationIsImmediate(Specialization,
Info.getLocation()))
return TemplateDeductionResult::MiscellaneousDeductionFailure;
@@ -4975,7 +4966,7 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
// Unevaluated SFINAE context.
EnterExpressionEvaluationContext Unevaluated(
*this, Sema::ExpressionEvaluationContext::Unevaluated);
- SFINAETrap Trap(*this);
+ SFINAETrap Trap(*this, Info);
// C++ [temp.deduct.conv]p1:
// Template argument deduction is done by comparing the return
@@ -5614,10 +5605,6 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
Sema &S, FunctionTemplateDecl *FTD,
SmallVectorImpl<DeducedTemplateArgument> &Deduced,
TemplateDeductionInfo &Info, T &&CheckDeductionConsistency) {
- EnterExpressionEvaluationContext Unevaluated(
- S, Sema::ExpressionEvaluationContext::Unevaluated);
- Sema::SFINAETrap Trap(S);
-
Sema::ContextRAII SavedContext(S, getAsDeclContextOrEnclosing(FTD));
// C++26 [temp.deduct.type]p2:
@@ -5645,13 +5632,7 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
// and verify that the instantiated argument is both valid
// and equivalent to the parameter.
LocalInstantiationScope InstScope(S);
-
- if (auto TDR = CheckDeductionConsistency(S, FTD, CTAI.SugaredConverted);
- TDR != TemplateDeductionResult::Success)
- return TDR;
-
- return Trap.hasErrorOccurred() ? TemplateDeductionResult::SubstitutionFailure
- : TemplateDeductionResult::Success;
+ return CheckDeductionConsistency(S, FTD, CTAI.SugaredConverted);
}
/// Determine whether the function template \p FT1 is at least as
@@ -5717,9 +5698,12 @@ static bool isAtLeastAsSpecializedAs(
}
SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
+ EnterExpressionEvaluationContext Unevaluated(
+ S, Sema::ExpressionEvaluationContext::Unevaluated);
+ Sema::SFINAETrap Trap(S, Info);
Sema::InstantiatingTemplate Inst(
S, Info.getLocation(), FT2, DeducedArgs,
- Sema::CodeSynthesisContext::DeducedTemplateArgumentSubstitution, Info);
+ Sema::CodeSynthesisContext::DeducedTemplateArgumentSubstitution);
if (Inst.isInvalid())
return false;
@@ -5765,7 +5749,7 @@ static bool isAtLeastAsSpecializedAs(
});
}) == TemplateDeductionResult::Success;
});
- if (!AtLeastAsSpecialized)
+ if (!AtLeastAsSpecialized || Trap.hasErrorOccurred())
return false;
// C++0x [temp.deduct.partial]p11:
@@ -6241,10 +6225,11 @@ static bool isAtLeastAsSpecializedAs(Sema &S, QualType T1, QualType T2,
/*HasDeducedAnyParam=*/nullptr) != TemplateDeductionResult::Success)
return false;
- SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(),
- Deduced.end());
- Sema::InstantiatingTemplate Inst(S, Info.getLocation(), P2, DeducedArgs,
- Info);
+ SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
+ EnterExpressionEvaluationContext Unevaluated(
+ S, Sema::ExpressionEvaluationContext::Unevaluated);
+ Sema::SFINAETrap Trap(S, Info);
+ Sema::InstantiatingTemplate Inst(S, Info.getLocation(), P2, DeducedArgs);
if (Inst.isInvalid())
return false;
@@ -6252,8 +6237,6 @@ static bool isAtLeastAsSpecializedAs(Sema &S, QualType T1, QualType T2,
Ps = cast<TemplateSpecializationType>(T2)->template_arguments(),
As = cast<TemplateSpecializationType>(T1)->template_arguments();
- Sema::SFINAETrap Trap(S);
-
TemplateDeductionResult Result;
S.runWithSufficientStackSpace(Info.getLocation(), [&] {
Result = ::FinishTemplateArgumentDeduction(
@@ -6261,14 +6244,7 @@ static bool isAtLeastAsSpecializedAs(Sema &S, QualType T1, QualType T2,
/*IsPartialOrdering=*/true, Ps, As, Deduced, Info,
/*CopyDeducedArgs=*/false);
});
-
- if (Result != TemplateDeductionResult::Success)
- return false;
-
- if (Trap.hasErrorOccurred())
- return false;
-
- return true;
+ return Result == TemplateDeductionResult::Success && !Trap.hasErrorOccurred();
}
namespace {
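
The recurring move in this file: the unevaluated context and the SFINAE trap used to be (re)entered inside each FinishTemplateArgumentDeduction overload, and are now owned by the caller for the whole deduction, with the callee asserting the invariant instead. A generic sketch of hoisting a scoped guard from callee to caller:

#include <cassert>

struct ScopedFlag {
  bool &Flag;
  bool Saved;
  explicit ScopedFlag(bool &F) : Flag(F), Saved(F) { Flag = true; }
  ~ScopedFlag() { Flag = Saved; }
};

bool InUnevaluatedContext = false;

int finishDeduction() {
  assert(InUnevaluatedContext && "caller must set up the context");
  return 0; // ... consistency checks ...
}

int deduce() {
  ScopedFlag Unevaluated(InUnevaluatedContext); // owned here, once
  // ... explicit-argument substitution, deduction ...
  return finishDeduction();
}
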
diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
index 40811d4..bfb1066 100644
--- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
+++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
@@ -1025,6 +1025,7 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
TypeAliasTemplateDecl *AliasTemplate,
FunctionTemplateDecl *F, SourceLocation Loc) {
LocalInstantiationScope Scope(SemaRef);
+ Sema::NonSFINAEContext _1(SemaRef);
Sema::InstantiatingTemplate BuildingDeductionGuides(
SemaRef, AliasTemplate->getLocation(), F,
Sema::InstantiatingTemplate::BuildingDeductionGuidesTag{});
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 5fceacd..35205f4 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -606,8 +606,7 @@ bool Sema::CodeSynthesisContext::isInstantiationRecord() const {
Sema::InstantiatingTemplate::InstantiatingTemplate(
Sema &SemaRef, CodeSynthesisContext::SynthesisKind Kind,
SourceLocation PointOfInstantiation, SourceRange InstantiationRange,
- Decl *Entity, NamedDecl *Template, ArrayRef<TemplateArgument> TemplateArgs,
- sema::TemplateDeductionInfo *DeductionInfo)
+ Decl *Entity, NamedDecl *Template, ArrayRef<TemplateArgument> TemplateArgs)
: SemaRef(SemaRef) {
// Don't allow further instantiation if a fatal error and an uncompilable
// error have occurred. Any diagnostics we might have raised will not be
@@ -625,7 +624,6 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
Inst.Template = Template;
Inst.TemplateArgs = TemplateArgs.data();
Inst.NumTemplateArgs = TemplateArgs.size();
- Inst.DeductionInfo = DeductionInfo;
Inst.InstantiationRange = InstantiationRange;
Inst.InConstraintSubstitution =
Inst.Kind == CodeSynthesisContext::ConstraintSubstitution;
@@ -671,48 +669,40 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
Sema &SemaRef, SourceLocation PointOfInstantiation,
FunctionTemplateDecl *FunctionTemplate,
ArrayRef<TemplateArgument> TemplateArgs,
- CodeSynthesisContext::SynthesisKind Kind,
- sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
+ CodeSynthesisContext::SynthesisKind Kind, SourceRange InstantiationRange)
: InstantiatingTemplate(SemaRef, Kind, PointOfInstantiation,
InstantiationRange, FunctionTemplate, nullptr,
- TemplateArgs, &DeductionInfo) {
+ TemplateArgs) {
assert(Kind == CodeSynthesisContext::ExplicitTemplateArgumentSubstitution ||
Kind == CodeSynthesisContext::DeducedTemplateArgumentSubstitution ||
Kind == CodeSynthesisContext::BuildingDeductionGuides);
}
Sema::InstantiatingTemplate::InstantiatingTemplate(
- Sema &SemaRef, SourceLocation PointOfInstantiation,
- TemplateDecl *Template,
- ArrayRef<TemplateArgument> TemplateArgs,
- sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
+ Sema &SemaRef, SourceLocation PointOfInstantiation, TemplateDecl *Template,
+ ArrayRef<TemplateArgument> TemplateArgs, SourceRange InstantiationRange)
: InstantiatingTemplate(
- SemaRef,
- CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
+ SemaRef, CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
PointOfInstantiation, InstantiationRange, Template, nullptr,
- TemplateArgs, &DeductionInfo) {}
+ TemplateArgs) {}
Sema::InstantiatingTemplate::InstantiatingTemplate(
Sema &SemaRef, SourceLocation PointOfInstantiation,
ClassTemplatePartialSpecializationDecl *PartialSpec,
- ArrayRef<TemplateArgument> TemplateArgs,
- sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
+ ArrayRef<TemplateArgument> TemplateArgs, SourceRange InstantiationRange)
: InstantiatingTemplate(
- SemaRef,
- CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
+ SemaRef, CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
PointOfInstantiation, InstantiationRange, PartialSpec, nullptr,
- TemplateArgs, &DeductionInfo) {}
+ TemplateArgs) {}
Sema::InstantiatingTemplate::InstantiatingTemplate(
Sema &SemaRef, SourceLocation PointOfInstantiation,
VarTemplatePartialSpecializationDecl *PartialSpec,
- ArrayRef<TemplateArgument> TemplateArgs,
- sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
+ ArrayRef<TemplateArgument> TemplateArgs, SourceRange InstantiationRange)
: InstantiatingTemplate(
- SemaRef,
- CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
+ SemaRef, CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
PointOfInstantiation, InstantiationRange, PartialSpec, nullptr,
- TemplateArgs, &DeductionInfo) {}
+ TemplateArgs) {}
Sema::InstantiatingTemplate::InstantiatingTemplate(
Sema &SemaRef, SourceLocation PointOfInstantiation, ParmVarDecl *Param,
@@ -763,12 +753,11 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
Sema::InstantiatingTemplate::InstantiatingTemplate(
Sema &SemaRef, SourceLocation PointOfInstantiation,
- concepts::Requirement *Req, sema::TemplateDeductionInfo &DeductionInfo,
- SourceRange InstantiationRange)
+ concepts::Requirement *Req, SourceRange InstantiationRange)
: InstantiatingTemplate(
SemaRef, CodeSynthesisContext::RequirementInstantiation,
PointOfInstantiation, InstantiationRange, /*Entity=*/nullptr,
- /*Template=*/nullptr, /*TemplateArgs=*/{}, &DeductionInfo) {}
+ /*Template=*/nullptr, /*TemplateArgs=*/{}) {}
Sema::InstantiatingTemplate::InstantiatingTemplate(
Sema &SemaRef, SourceLocation PointOfInstantiation,
@@ -781,11 +770,11 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
Sema::InstantiatingTemplate::InstantiatingTemplate(
Sema &SemaRef, SourceLocation PointOfInstantiation, const RequiresExpr *RE,
- sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
+ SourceRange InstantiationRange)
: InstantiatingTemplate(
SemaRef, CodeSynthesisContext::RequirementParameterInstantiation,
PointOfInstantiation, InstantiationRange, /*Entity=*/nullptr,
- /*Template=*/nullptr, /*TemplateArgs=*/{}, &DeductionInfo) {}
+ /*Template=*/nullptr, /*TemplateArgs=*/{}) {}
Sema::InstantiatingTemplate::InstantiatingTemplate(
Sema &SemaRef, SourceLocation PointOfInstantiation,
@@ -797,13 +786,11 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
TemplateArgs) {}
Sema::InstantiatingTemplate::InstantiatingTemplate(
- Sema &SemaRef, SourceLocation PointOfInstantiation,
- ConstraintSubstitution, NamedDecl *Template,
- sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
+ Sema &SemaRef, SourceLocation PointOfInstantiation, ConstraintSubstitution,
+ NamedDecl *Template, SourceRange InstantiationRange)
: InstantiatingTemplate(
SemaRef, CodeSynthesisContext::ConstraintSubstitution,
- PointOfInstantiation, InstantiationRange, Template, nullptr,
- {}, &DeductionInfo) {}
+ PointOfInstantiation, InstantiationRange, Template, nullptr, {}) {}
Sema::InstantiatingTemplate::InstantiatingTemplate(
Sema &SemaRef, SourceLocation PointOfInstantiation,
@@ -835,9 +822,6 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
ArgLoc, InstantiationRange, PArg) {}
bool Sema::pushCodeSynthesisContext(CodeSynthesisContext Ctx) {
- Ctx.SavedInNonInstantiationSFINAEContext = InNonInstantiationSFINAEContext;
- InNonInstantiationSFINAEContext = false;
-
if (!Ctx.isInstantiationRecord()) {
++NonInstantiationEntries;
} else {
@@ -871,8 +855,6 @@ void Sema::popCodeSynthesisContext() {
--NonInstantiationEntries;
}
- InNonInstantiationSFINAEContext = Active.SavedInNonInstantiationSFINAEContext;
-
// Name lookup no longer looks in this template's defining module.
assert(CodeSynthesisContexts.size() >=
CodeSynthesisContextLookupModules.size() &&
@@ -1282,93 +1264,6 @@ void Sema::PrintInstantiationStack(InstantiationContextDiagFuncRef DiagFunc) {
}
}
-std::optional<TemplateDeductionInfo *> Sema::isSFINAEContext() const {
- if (InNonInstantiationSFINAEContext)
- return std::optional<TemplateDeductionInfo *>(nullptr);
-
- for (SmallVectorImpl<CodeSynthesisContext>::const_reverse_iterator
- Active = CodeSynthesisContexts.rbegin(),
- ActiveEnd = CodeSynthesisContexts.rend();
- Active != ActiveEnd;
- ++Active)
- {
- switch (Active->Kind) {
- case CodeSynthesisContext::TypeAliasTemplateInstantiation:
- // An instantiation of an alias template may or may not be a SFINAE
- // context, depending on what else is on the stack.
- if (isa<TypeAliasTemplateDecl>(Active->Entity))
- break;
- [[fallthrough]];
- case CodeSynthesisContext::TemplateInstantiation:
- case CodeSynthesisContext::DefaultFunctionArgumentInstantiation:
- case CodeSynthesisContext::ExceptionSpecInstantiation:
- case CodeSynthesisContext::ConstraintsCheck:
- case CodeSynthesisContext::ParameterMappingSubstitution:
- case CodeSynthesisContext::ConstraintNormalization:
- case CodeSynthesisContext::NestedRequirementConstraintsCheck:
- // This is a template instantiation, so there is no SFINAE.
- return std::nullopt;
- case CodeSynthesisContext::LambdaExpressionSubstitution:
- // [temp.deduct]p9
- // A lambda-expression appearing in a function type or a template
- // parameter is not considered part of the immediate context for the
- // purposes of template argument deduction.
- // CWG2672: A lambda-expression body is never in the immediate context.
- return std::nullopt;
-
- case CodeSynthesisContext::DefaultTemplateArgumentInstantiation:
- case CodeSynthesisContext::PriorTemplateArgumentSubstitution:
- case CodeSynthesisContext::DefaultTemplateArgumentChecking:
- case CodeSynthesisContext::RewritingOperatorAsSpaceship:
- case CodeSynthesisContext::PartialOrderingTTP:
- // A default template argument instantiation and substitution into
- // template parameters with arguments for prior parameters may or may
- // not be a SFINAE context; look further up the stack.
- break;
-
- case CodeSynthesisContext::ExplicitTemplateArgumentSubstitution:
- case CodeSynthesisContext::DeducedTemplateArgumentSubstitution:
- // We're either substituting explicitly-specified template arguments,
- // deduced template arguments. SFINAE applies unless we are in a lambda
- // body, see [temp.deduct]p9.
- case CodeSynthesisContext::ConstraintSubstitution:
- case CodeSynthesisContext::RequirementInstantiation:
- case CodeSynthesisContext::RequirementParameterInstantiation:
- // SFINAE always applies in a constraint expression or a requirement
- // in a requires expression.
- assert(Active->DeductionInfo && "Missing deduction info pointer");
- return Active->DeductionInfo;
-
- case CodeSynthesisContext::DeclaringSpecialMember:
- case CodeSynthesisContext::DeclaringImplicitEqualityComparison:
- case CodeSynthesisContext::DefiningSynthesizedFunction:
- case CodeSynthesisContext::InitializingStructuredBinding:
- case CodeSynthesisContext::MarkingClassDllexported:
- case CodeSynthesisContext::BuildingBuiltinDumpStructCall:
- case CodeSynthesisContext::BuildingDeductionGuides:
- // This happens in a context unrelated to template instantiation, so
- // there is no SFINAE.
- return std::nullopt;
-
- case CodeSynthesisContext::ExceptionSpecEvaluation:
- // FIXME: This should not be treated as a SFINAE context, because
- // we will cache an incorrect exception specification. However, clang
- // bootstrap relies this! See PR31692.
- break;
-
- case CodeSynthesisContext::Memoization:
- break;
- }
-
- // The inner context was transparent for SFINAE. If it occurred within a
- // non-instantiation SFINAE context, then SFINAE applies.
- if (Active->SavedInNonInstantiationSFINAEContext)
- return std::optional<TemplateDeductionInfo *>(nullptr);
- }
-
- return std::nullopt;
-}
-
//===----------------------------------------------------------------------===/
// Template Instantiation for Types
//===----------------------------------------------------------------------===/
@@ -2674,10 +2569,9 @@ ExprResult TemplateInstantiator::TransformRequiresTypeParams(
Sema::ExtParameterInfoBuilder &PInfos) {
TemplateDeductionInfo Info(KWLoc);
- Sema::InstantiatingTemplate TypeInst(SemaRef, KWLoc,
- RE, Info,
+ Sema::InstantiatingTemplate TypeInst(SemaRef, KWLoc, RE,
SourceRange{KWLoc, RBraceLoc});
- Sema::SFINAETrap Trap(SemaRef);
+ Sema::SFINAETrap Trap(SemaRef, Info);
unsigned ErrorIdx;
if (getDerived().TransformFunctionTypeParams(
@@ -2709,10 +2603,10 @@ TemplateInstantiator::TransformTypeRequirement(concepts::TypeRequirement *Req) {
return Req;
}
- Sema::SFINAETrap Trap(SemaRef);
TemplateDeductionInfo Info(Req->getType()->getTypeLoc().getBeginLoc());
- Sema::InstantiatingTemplate TypeInst(SemaRef,
- Req->getType()->getTypeLoc().getBeginLoc(), Req, Info,
+ Sema::SFINAETrap Trap(SemaRef, Info);
+ Sema::InstantiatingTemplate TypeInst(
+ SemaRef, Req->getType()->getTypeLoc().getBeginLoc(), Req,
Req->getType()->getTypeLoc().getSourceRange());
if (TypeInst.isInvalid())
return nullptr;
@@ -2730,8 +2624,6 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) {
if (!Req->isDependent() && !AlwaysRebuild())
return Req;
- Sema::SFINAETrap Trap(SemaRef);
-
llvm::PointerUnion<Expr *, concepts::Requirement::SubstitutionDiagnostic *>
TransExpr;
if (Req->isExprSubstitutionFailure())
@@ -2739,7 +2631,8 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) {
else {
Expr *E = Req->getExpr();
TemplateDeductionInfo Info(E->getBeginLoc());
- Sema::InstantiatingTemplate ExprInst(SemaRef, E->getBeginLoc(), Req, Info,
+ Sema::SFINAETrap Trap(SemaRef, Info);
+ Sema::InstantiatingTemplate ExprInst(SemaRef, E->getBeginLoc(), Req,
E->getSourceRange());
if (ExprInst.isInvalid())
return nullptr;
@@ -2765,8 +2658,9 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) {
TemplateParameterList *OrigTPL =
RetReq.getTypeConstraintTemplateParameterList();
TemplateDeductionInfo Info(OrigTPL->getTemplateLoc());
- Sema::InstantiatingTemplate TPLInst(SemaRef, OrigTPL->getTemplateLoc(),
- Req, Info, OrigTPL->getSourceRange());
+ Sema::SFINAETrap Trap(SemaRef, Info);
+ Sema::InstantiatingTemplate TPLInst(SemaRef, OrigTPL->getTemplateLoc(), Req,
+ OrigTPL->getSourceRange());
if (TPLInst.isInvalid())
return nullptr;
TemplateParameterList *TPL = TransformTemplateParameterList(OrigTPL);
@@ -2830,11 +2724,9 @@ TemplateInstantiator::TransformNestedRequirement(
bool Success;
Expr *NewConstraint;
- TemplateDeductionInfo Info(Constraint->getBeginLoc());
{
EnterExpressionEvaluationContext ContextRAII(
SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated);
-
Sema::InstantiatingTemplate ConstrInst(
SemaRef, Constraint->getBeginLoc(), Req,
Sema::InstantiatingTemplate::ConstraintsCheck(),
@@ -2843,16 +2735,10 @@ TemplateInstantiator::TransformNestedRequirement(
if (ConstrInst.isInvalid())
return nullptr;
- Sema::SFINAETrap Trap(SemaRef);
-
Success = !SemaRef.CheckConstraintSatisfaction(
Req, AssociatedConstraint(Constraint, SemaRef.ArgPackSubstIndex),
TemplateArgs, Constraint->getSourceRange(), Satisfaction,
/*TopLevelConceptId=*/nullptr, &NewConstraint);
-
- assert((!Success || !Trap.hasErrorOccurred()) &&
- "Substitution failures must be handled "
- "by CheckConstraintSatisfaction.");
}
if (!Success || Satisfaction.HasSubstitutionFailure())
@@ -3306,7 +3192,7 @@ bool Sema::SubstDefaultArgument(
EnterExpressionEvaluationContext EvalContext(
*this, ExpressionEvaluationContext::PotentiallyEvaluated, Param);
-
+ NonSFINAEContext _(*this);
InstantiatingTemplate Inst(*this, Loc, Param, TemplateArgs.getInnermost());
if (Inst.isInvalid())
return true;
@@ -3594,6 +3480,7 @@ bool Sema::InstantiateClassImpl(
Spec->setPointOfInstantiation(PointOfInstantiation);
}
+ NonSFINAEContext _(*this);
InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
if (Inst.isInvalid())
return true;
@@ -3828,6 +3715,7 @@ bool Sema::InstantiateEnum(SourceLocation PointOfInstantiation,
MSInfo->setPointOfInstantiation(PointOfInstantiation);
}
+ NonSFINAEContext _(*this);
InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
if (Inst.isInvalid())
return true;
@@ -3892,6 +3780,7 @@ bool Sema::InstantiateInClassInitializer(
return true;
}
+ NonSFINAEContext _(*this);
InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
if (Inst.isInvalid())
return true;
@@ -3975,6 +3864,7 @@ static ActionResult<CXXRecordDecl *> getPatternForClassTemplateSpecialization(
Sema &S, SourceLocation PointOfInstantiation,
ClassTemplateSpecializationDecl *ClassTemplateSpec,
TemplateSpecializationKind TSK, bool PrimaryStrictPackMatch) {
+ std::optional<Sema::NonSFINAEContext> NSC(S);
Sema::InstantiatingTemplate Inst(S, PointOfInstantiation, ClassTemplateSpec);
if (Inst.isInvalid())
return {/*Invalid=*/true};
@@ -4076,6 +3966,7 @@ static ActionResult<CXXRecordDecl *> getPatternForClassTemplateSpecialization(
if (Ambiguous) {
// Partial ordering did not produce a clear winner. Complain.
Inst.Clear();
+ NSC.reset();
S.Diag(PointOfInstantiation,
diag::err_partial_spec_ordering_ambiguous)
<< ClassTemplateSpec;
@@ -4507,6 +4398,7 @@ ExprResult Sema::SubstConceptTemplateArguments(
TemplateArgumentListInfo SubstArgs(ArgsAsWritten->getLAngleLoc(),
ArgsAsWritten->getRAngleLoc());
+ NonSFINAEContext _(*this);
Sema::InstantiatingTemplate Inst(
*this, ArgsAsWritten->arguments().front().getSourceRange().getBegin(),
Sema::InstantiatingTemplate::ConstraintNormalization{},
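
Judging from its call sites in this file, NonSFINAEContext plausibly plays the inverse role of the trap: it clears the current trap for the duration of a plain instantiation so diagnostics escape normally, replacing the InNonInstantiationSFINAEContext bookkeeping deleted above. A sketch under that assumption (the member name CurrentSFINAEContext appears in a later hunk):

struct Trap; // stand-in for Sema::SFINAETrap

class NonSFINAEContextSketch {
  Trap *&Current; // e.g. Sema::CurrentSFINAEContext
  Trap *Saved;

public:
  explicit NonSFINAEContextSketch(Trap *&Cur) : Current(Cur), Saved(Cur) {
    Current = nullptr; // plain instantiation: diagnostics are not trapped
  }
  ~NonSFINAEContextSketch() { Current = Saved; }
};
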
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 681bfe0..4d58f00 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -5316,6 +5316,7 @@ void Sema::InstantiateExceptionSpec(SourceLocation PointOfInstantiation,
return;
}
+ NonSFINAEContext _(*this);
InstantiatingTemplate Inst(*this, PointOfInstantiation, Decl,
InstantiatingTemplate::ExceptionSpecification());
if (Inst.isInvalid()) {
@@ -5383,6 +5384,7 @@ TemplateDeclInstantiator::InitFunctionInstantiation(FunctionDecl *New,
if (ActiveInst.Kind == ActiveInstType::ExplicitTemplateArgumentSubstitution ||
ActiveInst.Kind == ActiveInstType::DeducedTemplateArgumentSubstitution) {
if (isa<FunctionTemplateDecl>(ActiveInst.Entity)) {
+ SemaRef.CurrentSFINAEContext = nullptr;
atTemplateEnd(SemaRef.TemplateInstCallbacks, SemaRef, ActiveInst);
ActiveInst.Kind = ActiveInstType::TemplateInstantiation;
ActiveInst.Entity = New;
@@ -5493,8 +5495,7 @@ FunctionDecl *Sema::InstantiateFunctionDeclaration(
SourceLocation Loc, CodeSynthesisContext::SynthesisKind CSC) {
FunctionDecl *FD = FTD->getTemplatedDecl();
- sema::TemplateDeductionInfo Info(Loc);
- InstantiatingTemplate Inst(*this, Loc, FTD, Args->asArray(), CSC, Info);
+ InstantiatingTemplate Inst(*this, Loc, FTD, Args->asArray(), CSC);
if (Inst.isInvalid())
return nullptr;
@@ -5684,6 +5685,7 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
}
}
+ NonSFINAEContext _(*this);
InstantiatingTemplate Inst(*this, PointOfInstantiation, Function);
if (Inst.isInvalid())
return;
@@ -5974,6 +5976,7 @@ VarTemplateSpecializationDecl *Sema::BuildVarTemplateInstantiation(
if (FromVar->isInvalidDecl())
return nullptr;
+ NonSFINAEContext _(*this);
InstantiatingTemplate Inst(*this, PointOfInstantiation, FromVar);
if (Inst.isInvalid())
return nullptr;
@@ -6281,6 +6284,7 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation,
!Var->hasInit()) {
// FIXME: Factor out the duplicated instantiation context setup/tear down
// code here.
+ NonSFINAEContext _(*this);
InstantiatingTemplate Inst(*this, PointOfInstantiation, Var);
if (Inst.isInvalid())
return;
@@ -6385,6 +6389,7 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation,
return;
}
+ NonSFINAEContext _(*this);
InstantiatingTemplate Inst(*this, PointOfInstantiation, Var);
if (Inst.isInvalid())
return;
diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp
index 0f72d6a..5b1aad3 100644
--- a/clang/lib/Sema/SemaTemplateVariadic.cpp
+++ b/clang/lib/Sema/SemaTemplateVariadic.cpp
@@ -844,7 +844,7 @@ bool Sema::CheckParameterPacksForExpansion(
ArrayRef<UnexpandedParameterPack> Unexpanded,
const MultiLevelTemplateArgumentList &TemplateArgs,
bool FailOnPackProducingTemplates, bool &ShouldExpand,
- bool &RetainExpansion, UnsignedOrNone &NumExpansions) {
+ bool &RetainExpansion, UnsignedOrNone &NumExpansions, bool Diagnose) {
ShouldExpand = true;
RetainExpansion = false;
IdentifierLoc FirstPack;
@@ -874,6 +874,9 @@ bool Sema::CheckParameterPacksForExpansion(
if (!FailOnPackProducingTemplates)
continue;
+ if (!Diagnose)
+ return true;
+
// It is not yet supported in certain contexts.
return Diag(PatternRange.getBegin().isValid() ? PatternRange.getBegin()
: EllipsisLoc,
@@ -1015,7 +1018,9 @@ bool Sema::CheckParameterPacksForExpansion(
// C++0x [temp.variadic]p5:
// All of the parameter packs expanded by a pack expansion shall have
// the same number of arguments specified.
- if (HaveFirstPack)
+ if (!Diagnose)
+ ;
+ else if (HaveFirstPack)
Diag(EllipsisLoc, diag::err_pack_expansion_length_conflict)
<< FirstPack.getIdentifierInfo() << Name << *NumExpansions
<< (LeastNewPackSize != NewPackSize) << LeastNewPackSize
@@ -1041,6 +1046,8 @@ bool Sema::CheckParameterPacksForExpansion(
if (NumExpansions && *NumExpansions < *NumPartialExpansions) {
NamedDecl *PartialPack =
CurrentInstantiationScope->getPartiallySubstitutedPack();
+ if (!Diagnose)
+ return true;
Diag(EllipsisLoc, diag::err_pack_expansion_length_conflict_partial)
<< PartialPack << *NumPartialExpansions << *NumExpansions
<< SourceRange(PartiallySubstitutedPackLoc);
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index dffd7c1..de210c4 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -15824,16 +15824,20 @@ TreeTransform<Derived>::TransformLambdaExpr(LambdaExpr *E) {
Sema::ExpressionEvaluationContext::PotentiallyEvaluated,
E->getCallOperator());
- Sema::CodeSynthesisContext C;
- C.Kind = clang::Sema::CodeSynthesisContext::LambdaExpressionSubstitution;
- C.PointOfInstantiation = E->getBody()->getBeginLoc();
- getSema().pushCodeSynthesisContext(C);
+ StmtResult Body;
+ {
+ Sema::NonSFINAEContext _(getSema());
+ Sema::CodeSynthesisContext C;
+ C.Kind = clang::Sema::CodeSynthesisContext::LambdaExpressionSubstitution;
+ C.PointOfInstantiation = E->getBody()->getBeginLoc();
+ getSema().pushCodeSynthesisContext(C);
- // Instantiate the body of the lambda expression.
- StmtResult Body =
- Invalid ? StmtError() : getDerived().TransformLambdaBody(E, E->getBody());
+ // Instantiate the body of the lambda expression.
+ Body = Invalid ? StmtError()
+ : getDerived().TransformLambdaBody(E, E->getBody());
- getSema().popCodeSynthesisContext();
+ getSema().popCodeSynthesisContext();
+ }
// ActOnLambda* will pop the function scope for us.
FuncScopeCleanup.disable();
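The scoping introduced above exists so the SFINAE state is suppressed only while the lambda body is transformed and is restored when the block ends. As a minimal sketch of that save/clear/restore RAII idiom (hypothetical names; the real guard is Sema::NonSFINAEContext):

    // Illustrative only: clear a slot for the lifetime of the guard,
    // restoring the previous value on scope exit.
    template <typename T>
    struct SaveAndClear {
      T &Slot;  // state being temporarily cleared
      T Saved;  // previous value, restored by the destructor
      explicit SaveAndClear(T &S) : Slot(S), Saved(S) { Slot = T(); }
      ~SaveAndClear() { Slot = Saved; }
    };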
diff --git a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
index 63f0d70..0ba3c05 100644
--- a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
+++ b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
@@ -3254,9 +3254,6 @@ bool ConditionBRVisitor::printValue(const Expr *CondVarExpr, raw_ostream &Out,
return true;
}
-constexpr llvm::StringLiteral ConditionBRVisitor::GenericTrueMessage;
-constexpr llvm::StringLiteral ConditionBRVisitor::GenericFalseMessage;
-
bool ConditionBRVisitor::isPieceMessageGeneric(
const PathDiagnosticPiece *Piece) {
return Piece->getString() == GenericTrueMessage ||
diff --git a/clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp b/clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp
index a06f7e2..3d63d4a 100644
--- a/clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp
+++ b/clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp
@@ -10,8 +10,6 @@
namespace clang {
namespace syntax {
-constexpr llvm::StringLiteral syntax::TokenBufferTokenManager::Kind;
-
std::pair<FileID, ArrayRef<syntax::Token>>
syntax::TokenBufferTokenManager::lexBuffer(
std::unique_ptr<llvm::MemoryBuffer> Input) {
diff --git a/clang/test/C/C2y/n3525.c b/clang/test/C/C2y/n3525.c
new file mode 100644
index 0000000..428df23
--- /dev/null
+++ b/clang/test/C/C2y/n3525.c
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s
+// RUN: %clang_cc1 -verify -std=c23 -Wall -pedantic %s
+
+/* WG14 N3525: Yes
+ * static_assert without UB
+ *
+ * Ensures that a static_assert declaration cannot defer to runtime; it must
+ * take an integer constant expression that is resolved at compile time.
+ *
+ * Note: implementations are free to extend what is a valid integer constant
+ * expression, and Clang (and GCC) does so. So this test is validating that
+ * we quietly accept a passing assertion, loudly reject a failing assertion, and
+ * issue a pedantic diagnostic for the extension case.
+ */
+
+static_assert(1); // Okay
+
+static_assert(0); // expected-error {{static assertion failed}}
+
+extern int a;
+static_assert(1 || a); // expected-warning {{expression is not an integer constant expression; folding it to a constant is a GNU extension}}
+
+static_assert(a); // expected-error {{static assertion expression is not an integral constant expression}}
+static_assert(0 || a); // expected-error {{static assertion expression is not an integral constant expression}}
+
+// Note, there is no CodeGen test for this; we have existing tests for the ICE
+// extension, so the pedantic warning is sufficient to verify we're not
+// emitting code which reads 'a' in '1 || a' because of the folding, and
+// there's no way to generate code for reading 'a' in '0 || a' because of the
+// error.
diff --git a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
index 368d652..51b0f81 100644
--- a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
@@ -1,161 +1,260 @@
// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
-// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s -DTARGET=dx
+// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN: -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s -DTARGET=dx \
+// RUN: --check-prefixes=CHECK,DXCHECK
// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
-// RUN: -emit-llvm -disable-llvm-passes \
-// RUN: -o - | FileCheck %s -DTARGET=spv
+// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN: -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s -DTARGET=spv
#ifdef __HLSL_ENABLE_16_BIT
// CHECK-LABEL: test_firstbithigh_ushort
-// CHECK: call i32 @llvm.[[TARGET]].firstbituhigh.i16
+// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbituhigh.i16
+// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 15, [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
+// DXCHECK-NEXT: select i1 %cmp.i, i32 -1, i32 [[SUB]]
+// CHECK-NEXT: ret i32
uint test_firstbithigh_ushort(uint16_t p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_ushort2
-// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i16
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i16
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 15), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <2 x i1> %cmp.i, <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
+// CHECK-NEXT: ret <2 x i32>
uint2 test_firstbithigh_ushort2(uint16_t2 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_ushort3
-// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i16
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i16
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 15), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <3 x i1> %cmp.i, <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
+// CHECK-NEXT: ret <3 x i32>
uint3 test_firstbithigh_ushort3(uint16_t3 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_ushort4
-// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i16
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i16
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 15), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <4 x i1> %cmp.i, <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
+// CHECK-NEXT: ret <4 x i32>
uint4 test_firstbithigh_ushort4(uint16_t4 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_short
-// CHECK: call i32 @llvm.[[TARGET]].firstbitshigh.i16
+// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbitshigh.i16
+// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 15, [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
+// DXCHECK-NEXT: select i1 %cmp.i, i32 -1, i32 [[SUB]]
+// CHECK-NEXT: ret i32
uint test_firstbithigh_short(int16_t p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_short2
-// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i16
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i16
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 15), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <2 x i1> %cmp.i, <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
+// CHECK-NEXT: ret <2 x i32>
uint2 test_firstbithigh_short2(int16_t2 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_short3
-// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i16
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i16
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 15), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <3 x i1> %cmp.i, <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
+// CHECK-NEXT: ret <3 x i32>
uint3 test_firstbithigh_short3(int16_t3 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_short4
-// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i16
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i16
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 15), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <4 x i1> %cmp.i, <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
+// CHECK-NEXT: ret <4 x i32>
uint4 test_firstbithigh_short4(int16_t4 p0) {
return firstbithigh(p0);
}
#endif // __HLSL_ENABLE_16_BIT
// CHECK-LABEL: test_firstbithigh_uint
-// CHECK: call i32 @llvm.[[TARGET]].firstbituhigh.i32
+// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbituhigh.i32
+// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 31, [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
+// DXCHECK-NEXT: select i1 %cmp.i, i32 -1, i32 [[SUB]]
+// CHECK-NEXT: ret i32
uint test_firstbithigh_uint(uint p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_uint2
-// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i32
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i32
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 31), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <2 x i1> %cmp.i, <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
+// CHECK-NEXT: ret <2 x i32>
uint2 test_firstbithigh_uint2(uint2 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_uint3
-// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i32
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i32
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 31), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <3 x i1> %cmp.i, <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
+// CHECK-NEXT: ret <3 x i32>
uint3 test_firstbithigh_uint3(uint3 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_uint4
-// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 31), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <4 x i1> %cmp.i, <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
+// CHECK-NEXT: ret <4 x i32>
uint4 test_firstbithigh_uint4(uint4 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_ulong
-// CHECK: call i32 @llvm.[[TARGET]].firstbituhigh.i64
+// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbituhigh.i64
+// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 63, [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
+// DXCHECK-NEXT: select i1 %cmp.i, i32 -1, i32 [[SUB]]
+// CHECK-NEXT: ret i32
uint test_firstbithigh_ulong(uint64_t p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_ulong2
-// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i64
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i64
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 63), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <2 x i1> %cmp.i, <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
+// CHECK-NEXT: ret <2 x i32>
uint2 test_firstbithigh_ulong2(uint64_t2 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_ulong3
-// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i64
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i64
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 63), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <3 x i1> %cmp.i, <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
+// CHECK-NEXT: ret <3 x i32>
uint3 test_firstbithigh_ulong3(uint64_t3 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_ulong4
-// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i64
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i64
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 63), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <4 x i1> %cmp.i, <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
+// CHECK-NEXT: ret <4 x i32>
uint4 test_firstbithigh_ulong4(uint64_t4 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_int
-// CHECK: call i32 @llvm.[[TARGET]].firstbitshigh.i32
+// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbitshigh.i32
+// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 31, [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
+// DXCHECK-NEXT: select i1 %cmp.i, i32 -1, i32 [[SUB]]
+// CHECK-NEXT: ret i32
uint test_firstbithigh_int(int p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_int2
-// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i32
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i32
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 31), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <2 x i1> %cmp.i, <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
+// CHECK-NEXT: ret <2 x i32>
uint2 test_firstbithigh_int2(int2 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_int3
-// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i32
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i32
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 31), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <3 x i1> %cmp.i, <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
+// CHECK-NEXT: ret <3 x i32>
uint3 test_firstbithigh_int3(int3 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_int4
-// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i32
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i32
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 31), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <4 x i1> %cmp.i, <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
+// CHECK-NEXT: ret <4 x i32>
uint4 test_firstbithigh_int4(int4 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_long
-// CHECK: call i32 @llvm.[[TARGET]].firstbitshigh.i64
+// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbitshigh.i64
+// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 63, [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
+// DXCHECK-NEXT: select i1 %cmp.i, i32 -1, i32 [[SUB]]
+// CHECK-NEXT: ret i32
uint test_firstbithigh_long(int64_t p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_long2
-// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i64
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i64
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 63), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <2 x i1> %cmp.i, <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
+// CHECK-NEXT: ret <2 x i32>
uint2 test_firstbithigh_long2(int64_t2 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_long3
-// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i64
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i64
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 63), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <3 x i1> %cmp.i, <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
+// CHECK-NEXT: ret <3 x i32>
uint3 test_firstbithigh_long3(int64_t3 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_long4
-// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i64
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i64
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 63), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <4 x i1> %cmp.i, <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
+// CHECK-NEXT: ret <4 x i32>
uint4 test_firstbithigh_long4(int64_t4 p0) {
return firstbithigh(p0);
}
// CHECK-LABEL: test_firstbithigh_upcast
-// CHECK: [[FBH:%.*]] = call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32(<4 x i32> %{{.*}})
-// CHECK: [[CONV:%.*]] = zext <4 x i32> [[FBH]] to <4 x i64>
-// CHECK: ret <4 x i64> [[CONV]]
+// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32(<4 x i32> %{{.*}})
+// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 31), [[FBH]]
+// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
+// DXCHECK-NEXT: select <4 x i1> %cmp.i, <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
+// CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i32> {{.*}} to <4 x i64>
+// CHECK-NEXT: ret <4 x i64> [[ZEXT]]
uint64_t4 test_firstbithigh_upcast(uint4 p0) {
return firstbithigh(p0);
}
diff --git a/clang/test/Driver/HLSL/wconversion.hlsl b/clang/test/Driver/HLSL/wconversion.hlsl
new file mode 100644
index 0000000..1857a3d
--- /dev/null
+++ b/clang/test/Driver/HLSL/wconversion.hlsl
@@ -0,0 +1,7 @@
+// RUN: %clang_dxc -T lib_6_7 -### %s 2>&1 | FileCheck %s --check-prefixes=CONV
+// RUN: %clang_dxc -T lib_6_7 -Wno-conversion -### %s 2>&1 | FileCheck %s --check-prefixes=NOCONV
+
+// make sure we generate -Wconversion by default
+// CONV: "-Wconversion"
+// make sure -Wno-conversion still works
+// NOCONV: "-Wno-conversion"
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1250.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1250.bc
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1250.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1251.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1251.bc
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1251.bc
diff --git a/clang/test/Driver/amdgpu-openmp-sanitize-options.c b/clang/test/Driver/amdgpu-openmp-sanitize-options.c
index 914e018..10d6498 100644
--- a/clang/test/Driver/amdgpu-openmp-sanitize-options.c
+++ b/clang/test/Driver/amdgpu-openmp-sanitize-options.c
@@ -22,10 +22,14 @@
// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address --rocm-path=%S/Inputs/rocm %s 2>&1 \
// RUN: | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s
-// ASan enabled for multiple amdgpu-arch [gfx908:xnack+,gfx900:xnack+]
+// GPU ASan enabled for multiple amdgpu-arch [gfx908:xnack+,gfx900:xnack+]
// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ --offload-arch=gfx900:xnack+ -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \
// RUN: | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s
+// GPU ASan enabled for amdgpu-arch [gfx1250,gfx1251]
+// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx1250,gfx1251 -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s
+
// GPU ASan Disabled Test Cases
// GPU ASan disabled through '-fsanitize=address' without '-fgpu-sanitize' flag for amdgpu-arch [gfx908]
@@ -56,9 +60,9 @@
// HOSTSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "c".*}}
-// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-mlink-bitcode-file" "[^"]*asanrtl.bc".* "-mlink-bitcode-file" "[^"]*ockl.bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-fsanitize=address".* "-x" "c".*}}
+// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-mlink-bitcode-file" "[^"]*asanrtl.bc".* "-mlink-bitcode-file" "[^"]*ockl.bc".* "-target-cpu" "(gfx908|gfx900|gfx1250|gfx1251)".* "-fopenmp".* "-fsanitize=address".* "-x" "c".*}}
// NOGPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-x" "c".*}}
-// SAN: {{"[^"]*llvm-offload-binary[^"]*" "-o".* "--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=gfx908(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}}
+// SAN: {{"[^"]*llvm-offload-binary[^"]*" "-o".* "--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=(gfx908|gfx1250|gfx1251)(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}}
// SAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "ir".*}}
// SAN: {{"[^"]*clang-linker-wrapper[^"]*".* "--host-triple=x86_64-unknown-linux-gnu".* "--linker-path=[^"]*".* "--whole-archive" "[^"]*(libclang_rt.asan_static.a|libclang_rt.asan_static-x86_64.a)".* "--whole-archive" "[^"]*(libclang_rt.asan.a|libclang_rt.asan-x86_64.a)".*}}
diff --git a/clang/test/Driver/hip-sanitize-options.hip b/clang/test/Driver/hip-sanitize-options.hip
index 0c9c15b..4903851 100644
--- a/clang/test/Driver/hip-sanitize-options.hip
+++ b/clang/test/Driver/hip-sanitize-options.hip
@@ -3,6 +3,11 @@
// RUN: -nogpuinc --rocm-path=%S/Inputs/rocm \
// RUN: %s 2>&1 | FileCheck -check-prefixes=NORDC %s
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx1250,gfx1251 \
+// RUN: -fsanitize=address \
+// RUN: -nogpuinc --rocm-path=%S/Inputs/rocm \
+// RUN: %s 2>&1 | FileCheck -check-prefixes=NORDC %s
+
// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \
// RUN: -fsanitize=address -fno-gpu-sanitize \
// RUN: -nogpuinc --rocm-path=%S/Inputs/rocm \
diff --git a/clang/test/Driver/rocm-device-libs.cl b/clang/test/Driver/rocm-device-libs.cl
index f9766e6..649dc85 100644
--- a/clang/test/Driver/rocm-device-libs.cl
+++ b/clang/test/Driver/rocm-device-libs.cl
@@ -139,6 +139,18 @@
// RUN: 2>&1 | FileCheck --check-prefixes=ASAN,COMMON %s
// RUN: %clang -### -target amdgcn-amd-amdhsa \
+// RUN: -x cl -mcpu=gfx1250 -fsanitize=address \
+// RUN: --rocm-path=%S/Inputs/rocm \
+// RUN: %s \
+// RUN: 2>&1 | FileCheck --check-prefixes=ASAN,COMMON %s
+
+// RUN: %clang -### -target amdgcn-amd-amdhsa \
+// RUN: -x cl -mcpu=gfx1251 -fsanitize=address \
+// RUN: --rocm-path=%S/Inputs/rocm \
+// RUN: %s \
+// RUN: 2>&1 | FileCheck --check-prefixes=ASAN,COMMON %s
+
+// RUN: %clang -### -target amdgcn-amd-amdhsa \
// RUN: -x cl -mcpu=gfx908:xnack+ \
// RUN: --rocm-path=%S/Inputs/rocm \
// RUN: %s \
diff --git a/clang/test/Parser/lambda-misplaced-capture-default.cpp b/clang/test/Parser/lambda-misplaced-capture-default.cpp
index d65b875..4f5bd6d 100644
--- a/clang/test/Parser/lambda-misplaced-capture-default.cpp
+++ b/clang/test/Parser/lambda-misplaced-capture-default.cpp
@@ -36,3 +36,12 @@ template <typename... Args> void Test(Args... args) {
[... xs = &args, &] {}; // expected-error {{capture default must be first}}
}
} // namespace misplaced_capture_default_pack
+
+namespace GH163498 {
+struct S {
+ template <class T> S(T) {}
+};
+void t() {
+ S s{[a(42), &] {}}; // expected-error {{capture default must be first}}
+}
+}
diff --git a/clang/test/Profile/Inputs/c-counter-overflows.proftext b/clang/test/Profile/Inputs/c-counter-overflows.proftext
index 4d0287c..8633060 100644
--- a/clang/test/Profile/Inputs/c-counter-overflows.proftext
+++ b/clang/test/Profile/Inputs/c-counter-overflows.proftext
@@ -1,5 +1,5 @@
main
-7779561829442898616
+862032801801816760
8
1
68719476720
diff --git a/clang/test/Profile/Inputs/c-general.profdata.v12 b/clang/test/Profile/Inputs/c-general.profdata.v12
new file mode 100644
index 0000000..57a72fa
--- /dev/null
+++ b/clang/test/Profile/Inputs/c-general.profdata.v12
Binary files differ
diff --git a/clang/test/Profile/Inputs/c-general.proftext b/clang/test/Profile/Inputs/c-general.proftext
index 08280ef..72e1be6 100644
--- a/clang/test/Profile/Inputs/c-general.proftext
+++ b/clang/test/Profile/Inputs/c-general.proftext
@@ -7,7 +7,7 @@ simple_loops
75
conditionals
-4904767535850050386
+293081517422662482
13
1
100
@@ -24,7 +24,7 @@ conditionals
1
early_exits
-2880354649761471549
+574511640547777597
9
1
0
@@ -37,7 +37,7 @@ early_exits
0
jumps
-15051420506203462683
+63440946314451995
22
1
1
@@ -86,7 +86,7 @@ switches
0
big_switch
-13144136522122330070
+461999971447013334
17
1
32
@@ -125,7 +125,7 @@ boolean_operators
33
boolop_loops
-12402604614320574815
+873389568252105055
13
1
50
@@ -149,7 +149,7 @@ conditional_operator
1
do_fallthrough
-8714614136504380050
+644163604256451218
4
1
10
diff --git a/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext b/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext
index d880663..7af5097 100644
--- a/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext
+++ b/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext
@@ -1,5 +1,5 @@
never_called
-6820425066224770721
+1055817543190535841
9
0
0
@@ -17,7 +17,7 @@ main
1
dead_code
-5254464978620792806
+642778960193404902
10
1
0
diff --git a/clang/test/Profile/Inputs/cxx-rangefor.proftext b/clang/test/Profile/Inputs/cxx-rangefor.proftext
index d41205b..cfc88da 100644
--- a/clang/test/Profile/Inputs/cxx-rangefor.proftext
+++ b/clang/test/Profile/Inputs/cxx-rangefor.proftext
@@ -1,5 +1,5 @@
_Z9range_forv
-8789831523895825398
+719380991647896566
5
1
4
diff --git a/clang/test/Profile/Inputs/cxx-throws.proftext b/clang/test/Profile/Inputs/cxx-throws.proftext
index 043dea0..92b0eab 100644
--- a/clang/test/Profile/Inputs/cxx-throws.proftext
+++ b/clang/test/Profile/Inputs/cxx-throws.proftext
@@ -1,5 +1,5 @@
_Z6throwsv
-18172607911962830854
+878785342860126214
9
1
100
diff --git a/clang/test/Profile/Inputs/misexpect-switch-default.proftext b/clang/test/Profile/Inputs/misexpect-switch-default.proftext
index 533da91..112426e 100644
--- a/clang/test/Profile/Inputs/misexpect-switch-default.proftext
+++ b/clang/test/Profile/Inputs/misexpect-switch-default.proftext
@@ -1,6 +1,6 @@
main
# Func Hash:
-8734802134600123338
+664351602352194506
# Num Counters:
9
# Counter Values:
diff --git a/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext b/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext
index 0da9379..99d067c 100644
--- a/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext
+++ b/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext
@@ -1,6 +1,6 @@
main
# Func Hash:
-3721743393642630379
+262978879822089451
# Num Counters:
10
# Counter Values:
diff --git a/clang/test/Profile/c-collision.c b/clang/test/Profile/c-collision.c
index 6c779c6..f35ba1b 100644
--- a/clang/test/Profile/c-collision.c
+++ b/clang/test/Profile/c-collision.c
@@ -2,8 +2,8 @@
// RUN: %clang_cc1 -UEXTRA -triple x86_64-unknown-linux-gnu -main-file-name c-collision.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s --check-prefix=CHECK-NOEXTRA
// RUN: %clang_cc1 -DEXTRA -triple x86_64-unknown-linux-gnu -main-file-name c-collision.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s --check-prefix=CHECK-EXTRA
-// CHECK-NOEXTRA: @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 7156072912471487002,
-// CHECK-EXTRA: @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 -4383447408116050035,
+// CHECK-NOEXTRA: @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 238543884830405146,
+// CHECK-EXTRA: @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 228238610311337869,
extern int bar;
void foo(void) {
diff --git a/clang/test/Profile/c-general.c b/clang/test/Profile/c-general.c
index ee36a43..6c865e6 100644
--- a/clang/test/Profile/c-general.c
+++ b/clang/test/Profile/c-general.c
@@ -4,6 +4,7 @@
// RUN: llvm-profdata merge %S/Inputs/c-general.proftext -o %t.profdata
// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use=clang -fprofile-instrument-use-path=%t.profdata | FileCheck -allow-deprecated-dag-overlap -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use=clang -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v12 | FileCheck -allow-deprecated-dag-overlap -check-prefix=PGOUSE %s
// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use=clang -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v5 | FileCheck -allow-deprecated-dag-overlap -check-prefix=PGOUSE %s
// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use=clang -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v3 | FileCheck -allow-deprecated-dag-overlap -check-prefix=PGOUSE %s
// Also check compatibility with older profiles.
diff --git a/clang/test/SemaCXX/attr-mode-tmpl.cpp b/clang/test/SemaCXX/attr-mode-tmpl.cpp
index f665b1b..3a1da3b 100644
--- a/clang/test/SemaCXX/attr-mode-tmpl.cpp
+++ b/clang/test/SemaCXX/attr-mode-tmpl.cpp
@@ -45,7 +45,7 @@ void CheckMachineMode() {
// Check attributes on function parameters.
template <class T1, class T2>
-void CheckParameters(T1 __attribute__((mode(SI))) paramSI, // expected-note{{ignored: substitution failure}} expected-note-re{{not viable: no known conversion from '{{.*}}' (vector of 4 '{{.*}}' values) to 'EnumType' for 2nd argument}}
+void CheckParameters(T1 __attribute__((mode(SI))) paramSI, // expected-note{{ignored: substitution failure}} expected-note{{ignored: substitution failure [with T1 = int, T2 = int]: type of machine mode does not match type of base type}}
T1 __attribute__((mode(V4DI))) paramV4DI, // expected-warning{{deprecated}}
T2 __attribute__((mode(SF))) paramSF,
T2 __attribute__((mode(V4DF))) paramV4DF) { // expected-warning{{deprecated}}
diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp
index ce86266..a594a1a 100644
--- a/clang/test/SemaCXX/cxx23-assume.cpp
+++ b/clang/test/SemaCXX/cxx23-assume.cpp
@@ -108,7 +108,8 @@ constexpr bool f4() {
template <typename T>
concept C = f4<T>(); // expected-note 3 {{in instantiation of}}
// expected-note@-1 3 {{while substituting}}
- // expected-error@-2 2 {{resulted in a non-constant expression}}
+ // expected-error@-2 {{resulted in a non-constant expression}}
+ // expected-note@-3 {{because substituted constraint expression is ill-formed: substitution into constraint expression resulted in a non-constant expression}}
struct D {
int x;
@@ -130,13 +131,13 @@ constexpr int f5() requires C<T> { return 1; } // expected-note {{while checking
// expected-note@-1 {{candidate template ignored}}
template <typename T>
-constexpr int f5() requires (!C<T>) { return 2; } // expected-note 4 {{while checking the satisfaction}} \
- // expected-note 4 {{while substituting template arguments}} \
+constexpr int f5() requires (!C<T>) { return 2; } // expected-note 3 {{while checking the satisfaction}} \
+ // expected-note 3 {{while substituting template arguments}} \
// expected-note {{candidate template ignored}}
static_assert(f5<int>() == 1);
-static_assert(f5<D>() == 1); // expected-note 3 {{while checking constraint satisfaction}}
- // expected-note@-1 3 {{while substituting deduced template arguments}}
+static_assert(f5<D>() == 1); // expected-note 2 {{while checking constraint satisfaction}}
+ // expected-note@-1 2 {{while substituting deduced template arguments}}
// expected-error@-2 {{no matching function for call}}
static_assert(f5<double>() == 2);
@@ -170,7 +171,7 @@ foo (int x, int y)
// Do not crash when assumptions are unreachable.
namespace gh106898 {
-int foo () {
+int foo () {
while(1);
int a = 0, b = 1;
__attribute__((assume (a < b)));
diff --git a/clang/test/SemaCXX/cxx2b-warn-shadow.cpp b/clang/test/SemaCXX/cxx2b-warn-shadow.cpp
index 76866c4..9ce0c5a 100644
--- a/clang/test/SemaCXX/cxx2b-warn-shadow.cpp
+++ b/clang/test/SemaCXX/cxx2b-warn-shadow.cpp
@@ -11,3 +11,29 @@ struct Foo {
}
};
} // namespace GH95707
+
+namespace GH163731 {
+struct S1 {
+ int a;
+ void m(this S1 &self) {
+ auto lambda = [](int a) { return a; };
+ }
+};
+
+struct S2 {
+ int a;
+ void m(this S2 &self) {
+ int a = 1; // expected-note {{previous declaration is here}}
+ auto lambda = [](int a) { // expected-warning {{declaration shadows a local variable}}
+ return a;
+ };
+ }
+};
+
+struct S3 {
+ int a;
+ void m(this S3 &self) {
+ auto lambda = [self](int a) { return a + self.a; };
+ }
+};
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
index f99e606..1f70186 100644
--- a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
@@ -12,7 +12,7 @@ int test_too_many_arg(int p0) {
double test_int_builtin(double p0) {
return firstbithigh(p0);
- // expected-error@-1 {{call to 'firstbithigh' is ambiguous}}
+ // expected-error@-1 {{no matching function for call to 'firstbithigh'}}
}
double2 test_int_builtin_2(double2 p0) {
diff --git a/clang/test/SemaHLSL/Types/AggregateSplatConstantExpr.hlsl b/clang/test/SemaHLSL/Types/AggregateSplatConstantExpr.hlsl
new file mode 100644
index 0000000..630acd8
--- /dev/null
+++ b/clang/test/SemaHLSL/Types/AggregateSplatConstantExpr.hlsl
@@ -0,0 +1,89 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -fnative-int16-type -std=hlsl202x -verify %s
+
+// expected-no-diagnostics
+
+struct Base {
+ double D;
+ uint64_t2 U;
+ int16_t I : 5;
+ uint16_t I2: 5;
+};
+
+struct R : Base {
+ int G : 10;
+ int : 30;
+ float F;
+};
+
+struct B1 {
+ float A;
+ float B;
+};
+
+struct B2 : B1 {
+ int C;
+ int D;
+ bool BB;
+};
+
+// tests for HLSLAggregateSplatCast
+export void fn() {
+ // result type vector
+ // splat from a vector of size 1
+
+ constexpr float1 Y = {1.0};
+ constexpr float4 F4 = (float4)Y;
+ _Static_assert(F4[0] == 1.0, "Woo!");
+ _Static_assert(F4[1] == 1.0, "Woo!");
+ _Static_assert(F4[2] == 1.0, "Woo!");
+ _Static_assert(F4[3] == 1.0, "Woo!");
+
+ // result type array
+ // splat from a scalar
+ constexpr float F = 3.33;
+ constexpr int B6[6] = (int[6])F;
+ _Static_assert(B6[0] == 3, "Woo!");
+ _Static_assert(B6[1] == 3, "Woo!");
+ _Static_assert(B6[2] == 3, "Woo!");
+ _Static_assert(B6[3] == 3, "Woo!");
+ _Static_assert(B6[4] == 3, "Woo!");
+ _Static_assert(B6[5] == 3, "Woo!");
+
+ // splat from a vector of size 1
+ constexpr int1 A1 = {1};
+ constexpr uint64_t2 A7[2] = (uint64_t2[2])A1;
+ _Static_assert(A7[0][0] == 1, "Woo!");
+ _Static_assert(A7[0][1] == 1, "Woo!");
+ _Static_assert(A7[1][0] == 1, "Woo!");
+ _Static_assert(A7[1][1] == 1, "Woo!");
+
+ // result type struct
+ // splat from a scalar
+ constexpr double D = 97.6789;
+ constexpr R SR = (R)(D + 3.0);
+ _Static_assert(SR.D == 100.6789, "Woo!");
+ _Static_assert(SR.U[0] == 100, "Woo!");
+ _Static_assert(SR.U[1] == 100, "Woo!");
+ _Static_assert(SR.I == 4, "Woo!");
+ _Static_assert(SR.I2 == 4, "Woo!");
+ _Static_assert(SR.G == 100, "Woo!");
+ _Static_assert(SR.F == 100.6789, "Woo!");
+
+ // splat from a vector of size 1
+ constexpr float1 A100 = {1000.1111};
+ constexpr B2 SB2 = (B2)A100;
+ _Static_assert(SB2.A == 1000.1111, "Woo!");
+ _Static_assert(SB2.B == 1000.1111, "Woo!");
+ _Static_assert(SB2.C == 1000, "Woo!");
+ _Static_assert(SB2.D == 1000, "Woo!");
+ _Static_assert(SB2.BB == true, "Woo!");
+
+ // splat from a bool to an int and float etc
+ constexpr bool B = true;
+ constexpr B2 SB3 = (B2)B;
+ _Static_assert(SB3.A == 1.0, "Woo!");
+ _Static_assert(SB3.B == 1.0, "Woo!");
+ _Static_assert(SB3.C == 1, "Woo!");
+ _Static_assert(SB3.D == 1, "Woo!");
+ _Static_assert(SB3.BB == true, "Woo!");
+}
diff --git a/clang/test/SemaHLSL/Types/ElementwiseCastConstantExpr.hlsl b/clang/test/SemaHLSL/Types/ElementwiseCastConstantExpr.hlsl
new file mode 100644
index 0000000..c9963c3
--- /dev/null
+++ b/clang/test/SemaHLSL/Types/ElementwiseCastConstantExpr.hlsl
@@ -0,0 +1,90 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -fnative-int16-type -std=hlsl202x -verify %s
+
+// expected-no-diagnostics
+
+struct Base {
+ double D;
+ uint64_t2 U;
+ int16_t I : 5;
+ uint16_t I2: 5;
+};
+
+struct R : Base {
+ int G : 10;
+ int : 30;
+ float F;
+};
+
+struct B1 {
+ float A;
+ float B;
+};
+
+struct B2 : B1 {
+ int C;
+ int D;
+ bool BB;
+};
+
+export void fn() {
+ _Static_assert(((float4)(int[6]){1,2,3,4,5,6}).x == 1.0, "Woo!");
+
+ // Compiling this successfully verifies that the array constant expression
+ // gets truncated to a scalar at compile time via the flat cast.
+ _Static_assert(((int)(int[2]){1,2}) == 1, "Woo!");
+
+ // truncation tests
+ // result type int
+ // truncate from struct
+ constexpr B1 SB1 = {1.0, 3.0};
+ constexpr int X = (int)SB1;
+ _Static_assert(X == 1, "Woo!");
+
+ // result type float
+ // truncate from array
+ constexpr B1 Arr[2] = {4.0, 3.0, 2.0, 1.0};
+ constexpr float F = (float)Arr;
+ _Static_assert(F == 4.0, "Woo!");
+
+ // result type vector
+ // truncate from array of vector
+ constexpr int2 Arr2[2] = {5,6,7,8};
+ constexpr int2 I2 = (int2)Arr2;
+ _Static_assert(I2[0] == 5, "Woo!");
+ _Static_assert(I2[1] == 6, "Woo!");
+
+ // lhs and rhs are same "size" tests
+
+ // result type vector from array
+ constexpr int4 I4 = (int4)Arr;
+ _Static_assert(I4[0] == 4, "Woo!");
+ _Static_assert(I4[1] == 3, "Woo!");
+ _Static_assert(I4[2] == 2, "Woo!");
+ _Static_assert(I4[3] == 1, "Woo!");
+
+ // result type array from vector
+ constexpr double3 D3 = {100.11, 200.11, 300.11};
+ constexpr float FArr[3] = (float[3])D3;
+ _Static_assert(FArr[0] == 100.11, "Woo!");
+ _Static_assert(FArr[1] == 200.11, "Woo!");
+ _Static_assert(FArr[2] == 300.11, "Woo!");
+
+ // result type struct from struct
+ constexpr B2 SB2 = {5.5, 6.5, 1000, 5000, false};
+ constexpr Base SB = (Base)SB2;
+ _Static_assert(SB.D == 5.5, "Woo!");
+ _Static_assert(SB.U[0] == 6, "Woo!");
+ _Static_assert(SB.U[1] == 1000, "Woo!");
+ _Static_assert(SB.I == 8, "Woo!");
+ _Static_assert(SB.I2 == 0, "Woo!");
+
+ // Make sure we read bitfields correctly
+ constexpr Base BB = {222.22, {100, 200}, -2, 7};
+ constexpr int Arr3[5] = (int[5])BB;
+ _Static_assert(Arr3[0] == 222, "Woo!");
+ _Static_assert(Arr3[1] == 100, "Woo!");
+ _Static_assert(Arr3[2] == 200, "Woo!");
+ _Static_assert(Arr3[3] == -2, "Woo!");
+ _Static_assert(Arr3[4] == 7, "Woo!");
+}
diff --git a/clang/test/SemaTemplate/temp_arg_nontype.cpp b/clang/test/SemaTemplate/temp_arg_nontype.cpp
index 7d2a010..bd0bf3c 100644
--- a/clang/test/SemaTemplate/temp_arg_nontype.cpp
+++ b/clang/test/SemaTemplate/temp_arg_nontype.cpp
@@ -173,8 +173,7 @@ namespace pr6249 {
}
namespace PR6723 {
- template<unsigned char C> void f(int (&a)[C]); // expected-note 3{{candidate template ignored: substitution failure [with C = '\x00']}}
- // expected-note@-1 {{not viable: no known conversion from 'int[512]' to 'int (&)[0]'}}
+ template<unsigned char C> void f(int (&a)[C]); // expected-note 4{{candidate template ignored: substitution failure [with C = '\x00']}}
void g() {
int arr512[512];
f(arr512); // expected-error{{no matching function for call}}
diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp
index 5752cba..45bdb4c 100644
--- a/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp
+++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp
@@ -43,7 +43,7 @@ void TempFunc() {}
void Useage() {
//expected-error@+2 {{no matching function}}
- //expected-note@-4 {{candidate template ignored: invalid explicitly-specified argument for template parameter 'b'}}
+ //expected-note@-4 {{candidate template ignored: substitution failure [with a = 1, b = 4294967295, c = 1]: non-type template argument evaluates to -1, which cannot be narrowed to type 'unsigned int'}}
TempFunc<1, -1, 1>();
}
}
diff --git a/clang/tools/clang-repl/ClangRepl.cpp b/clang/tools/clang-repl/ClangRepl.cpp
index c787942..c86a131 100644
--- a/clang/tools/clang-repl/ClangRepl.cpp
+++ b/clang/tools/clang-repl/ClangRepl.cpp
@@ -309,6 +309,7 @@ int main(int argc, const char **argv) {
clang::Interpreter::JITConfig Config;
Config.IsOutOfProcess = !OOPExecutor.empty() || !OOPExecutorConnect.empty();
Config.OOPExecutor = OOPExecutor;
+ Config.OrcRuntimePath = OrcRuntimePath;
auto SizeOrErr = getSlabAllocSize(SlabAllocateSizeString);
if (!SizeOrErr) {
llvm::logAllUnhandledErrors(SizeOrErr.takeError(), llvm::errs(), "error: ");
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index 80a52f7..2c1f6f4 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -344,7 +344,7 @@ conformance.</p>
<tr>
<td>static_assert without UB</td>
<td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3525.htm">N3525</a></td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="full" align="center">Yes</td>
</tr>
<tr>
<td>Allow calling static inline within extern inline</td>
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index 0496f24..46d6bb5 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -722,7 +722,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
/* Raw profile format version (start from 1). */
#define INSTR_PROF_RAW_VERSION 10
/* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 12
+#define INSTR_PROF_INDEX_VERSION 13
/* Coverage mapping format version (start from 0). */
#define INSTR_PROF_COVMAP_VERSION 6
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp
index a5ec85ae..72f4bbf 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp
@@ -45,7 +45,6 @@ struct MemoryMappedSegmentData {
const char *current_load_cmd_addr;
u32 lc_type;
uptr base_virt_addr;
- uptr addr_mask;
};
template <typename Section>
@@ -54,12 +53,58 @@ static void NextSectionLoad(LoadedModule *module, MemoryMappedSegmentData *data,
const Section *sc = (const Section *)data->current_load_cmd_addr;
data->current_load_cmd_addr += sizeof(Section);
- uptr sec_start = (sc->addr & data->addr_mask) + data->base_virt_addr;
+ uptr sec_start = sc->addr + data->base_virt_addr;
uptr sec_end = sec_start + sc->size;
module->addAddressRange(sec_start, sec_end, /*executable=*/false, isWritable,
sc->sectname);
}
+static bool VerifyMemoryMapping(MemoryMappingLayout* mapping) {
+ InternalMmapVector<LoadedModule> modules;
+ modules.reserve(128); // matches DumpProcessMap
+ mapping->DumpListOfModules(&modules);
+
+ InternalMmapVector<LoadedModule::AddressRange> segments;
+ for (uptr i = 0; i < modules.size(); ++i) {
+ for (auto& range : modules[i].ranges()) {
+ segments.push_back(range);
+ }
+ }
+
+ // Verify that none of the segments overlap:
+ // 1. Sort the segments by the start address
+ // 2. Check that every segment starts after the previous one ends.
+ Sort(segments.data(), segments.size(),
+ [](LoadedModule::AddressRange& a, LoadedModule::AddressRange& b) {
+ return a.beg < b.beg;
+ });
+
+ // To avoid spam, we only print the report message once per process.
+ static bool invalid_module_map_reported = false;
+ bool well_formed = true;
+
+ for (size_t i = 1; i < segments.size(); i++) {
+ uptr cur_start = segments[i].beg;
+ uptr prev_end = segments[i - 1].end;
+ if (cur_start < prev_end) {
+ well_formed = false;
+ VReport(2, "Overlapping mappings: %s start = %p, %s end = %p\n",
+ segments[i].name, (void*)cur_start, segments[i - 1].name,
+ (void*)prev_end);
+ if (!invalid_module_map_reported) {
+ Report(
+ "WARN: Invalid dyld module map detected. This is most likely a bug "
+ "in the sanitizer.\n");
+ Report("WARN: Backtraces may be unreliable.\n");
+ invalid_module_map_reported = true;
+ }
+ }
+ }
+
+ mapping->Reset();
+ return well_formed;
+}
+
void MemoryMappedSegment::AddAddressRanges(LoadedModule *module) {
// Don't iterate over sections when the caller hasn't set up the
// data pointer, when there are no sections, or when the segment
@@ -85,6 +130,7 @@ void MemoryMappedSegment::AddAddressRanges(LoadedModule *module) {
MemoryMappingLayout::MemoryMappingLayout(bool cache_enabled) {
Reset();
+ VerifyMemoryMapping(this);
}
MemoryMappingLayout::~MemoryMappingLayout() {
@@ -190,6 +236,7 @@ typedef struct dyld_shared_cache_dylib_text_info
extern bool _dyld_get_shared_cache_uuid(uuid_t uuid);
extern const void *_dyld_get_shared_cache_range(size_t *length);
+extern intptr_t _dyld_get_image_slide(const struct mach_header* mh);
extern int dyld_shared_cache_iterate_text(
const uuid_t cacheUuid,
void (^callback)(const dyld_shared_cache_dylib_text_info *info));
@@ -258,23 +305,21 @@ static bool NextSegmentLoad(MemoryMappedSegment *segment,
layout_data->current_load_cmd_count--;
if (((const load_command *)lc)->cmd == kLCSegment) {
const SegmentCommand* sc = (const SegmentCommand *)lc;
- uptr base_virt_addr, addr_mask;
- if (layout_data->current_image == kDyldImageIdx) {
- base_virt_addr = (uptr)get_dyld_hdr();
- // vmaddr is masked with 0xfffff because on macOS versions < 10.12,
- // it contains an absolute address rather than an offset for dyld.
- // To make matters even more complicated, this absolute address
- // isn't actually the absolute segment address, but the offset portion
- // of the address is accurate when combined with the dyld base address,
- // and the mask will give just this offset.
- addr_mask = 0xfffff;
- } else {
+ if (strncmp(sc->segname, "__LINKEDIT", sizeof("__LINKEDIT")) == 0) {
+ // The LINKEDIT sections are for internal linker use, and may alias
+ // with the LINKEDIT sections of other modules. (If we included them,
+ // our memory map would contain overlapping sections.)
+ return false;
+ }
+
+ uptr base_virt_addr;
+ if (layout_data->current_image == kDyldImageIdx)
+ base_virt_addr = (uptr)_dyld_get_image_slide(get_dyld_hdr());
+ else
base_virt_addr =
(uptr)_dyld_get_image_vmaddr_slide(layout_data->current_image);
- addr_mask = ~0;
- }
- segment->start = (sc->vmaddr & addr_mask) + base_virt_addr;
+ segment->start = sc->vmaddr + base_virt_addr;
segment->end = segment->start + sc->vmsize;
// Most callers don't need section information, so only fill this struct
// when required.
@@ -284,7 +329,6 @@ static bool NextSegmentLoad(MemoryMappedSegment *segment,
(const char *)lc + sizeof(SegmentCommand);
seg_data->lc_type = kLCSegment;
seg_data->base_virt_addr = base_virt_addr;
- seg_data->addr_mask = addr_mask;
internal_strncpy(seg_data->name, sc->segname,
ARRAY_SIZE(seg_data->name));
}
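The VerifyMemoryMapping routine added in this file reduces to a standard interval-overlap check: sort the ranges by start address, then confirm each range begins at or after the previous one ends. A self-contained sketch of that logic in portable C++ (the real code uses sanitizer-internal containers and Sort()):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct Range { std::uintptr_t beg, end; };

    // Returns true when no two ranges overlap.
    bool WellFormed(std::vector<Range> segs) {
      std::sort(segs.begin(), segs.end(),
                [](const Range &a, const Range &b) { return a.beg < b.beg; });
      for (std::size_t i = 1; i < segs.size(); ++i)
        if (segs[i].beg < segs[i - 1].end)  // starts before predecessor ends
          return false;
      return true;
    }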
diff --git a/compiler-rt/test/asan/TestCases/Darwin/asan-verify-module-map.cpp b/compiler-rt/test/asan/TestCases/Darwin/asan-verify-module-map.cpp
new file mode 100644
index 0000000..7660841
--- /dev/null
+++ b/compiler-rt/test/asan/TestCases/Darwin/asan-verify-module-map.cpp
@@ -0,0 +1,25 @@
+// This test simply checks that the "Invalid dyld module map" warning is not printed
+// in the output of a backtrace.
+
+// RUN: %clangxx_asan -DSHARED_LIB -g %s -dynamiclib -o %t.dylib
+// RUN: %clangxx_asan -O0 -g %s %t.dylib -o %t.executable
+// RUN: %env_asan_opts="print_module_map=2" not %run %t.executable 2>&1 | FileCheck %s -DDYLIB=%t.dylib
+
+// CHECK-NOT: WARN: Invalid dyld module map
+// CHECK-DAG: 0x{{.*}}-0x{{.*}} [[DYLIB]]
+// CHECK-DAG: 0x{{.*}}-0x{{.*}} {{.*}}libsystem
+
+#ifdef SHARED_LIB
+extern "C" void foo(int *a) { *a = 5; }
+#else
+# include <cstdlib>
+
+extern "C" void foo(int *a);
+
+int main() {
+ int *a = (int *)malloc(sizeof(int));
+ free(a);
+ foo(a);
+ return 0;
+}
+#endif
\ No newline at end of file
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index 9f7c10c..891373e 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -450,6 +450,41 @@ mlir::Value inlineElementalOp(
mlir::IRMapping &mapper,
const std::function<bool(hlfir::ElementalOp)> &mustRecursivelyInline);
+/// Generate an element-by-element assignment from \p rhs to \p lhs for arrays
+/// that are known not to alias. The assignment is performed using a loop nest
+/// over the optimal extents deduced from both shapes. If \p emitWorkshareLoop
+/// is true, a workshare loop construct may be emitted when available.
+/// Allocatable LHS must be allocated with the right shape and parameters.
+void genNoAliasArrayAssignment(
+ mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
+ hlfir::Entity lhs, bool emitWorkshareLoop = false,
+ bool temporaryLHS = false,
+ std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
+ hlfir::Entity, hlfir::Entity)> *combiner =
+ nullptr);
+
+/// Generate an assignment from \p rhs to \p lhs when they are known not to
+/// alias. Handles both arrays and scalars: for arrays, delegates to
+/// genNoAliasArrayAssignment; for scalars, performs load/store for trivial
+/// scalar types and falls back to hlfir.assign otherwise.
+/// Allocatable LHS must be allocated with the right shape and parameters.
+void genNoAliasAssignment(
+ mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
+ hlfir::Entity lhs, bool emitWorkshareLoop = false,
+ bool temporaryLHS = false,
+ std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
+ hlfir::Entity, hlfir::Entity)> *combiner =
+ nullptr);
+inline void genNoAliasAssignment(
+ mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
+ hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS,
+ std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
+ hlfir::Entity, hlfir::Entity)>
+ combiner) {
+ genNoAliasAssignment(loc, builder, rhs, lhs, emitWorkshareLoop, temporaryLHS,
+ &combiner);
+}
+
/// Create a new temporary with the shape and parameters of the provided
/// hlfir.eval_in_mem operation and clone the body of the hlfir.eval_in_mem
/// operating on this new temporary. returns the temporary and whether the
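A hypothetical call site for the combiner overload declared above, assuming these helpers live in namespace hlfir like the surrounding declarations, with loc, builder, rhs, and lhs in scope and genCombine an invented helper:

    // Sketch only: element-by-element assignment that merges old and new values.
    hlfir::genNoAliasAssignment(
        loc, builder, rhs, lhs,
        /*emitWorkshareLoop=*/false, /*temporaryLHS=*/true,
        [](mlir::Location l, fir::FirOpBuilder &b, hlfir::Entity oldVal,
           hlfir::Entity newVal) -> hlfir::Entity {
          return genCombine(l, b, oldVal, newVal); // hypothetical combiner
        });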
diff --git a/flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h b/flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h
new file mode 100644
index 0000000..c798681
--- /dev/null
+++ b/flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h
@@ -0,0 +1,51 @@
+//===- FIROpenACCSupportAnalysis.h - FIR OpenACCSupport Analysis ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the FIR-specific implementation of OpenACCSupport analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_OPENACC_ANALYSIS_FIROPENACCSUPPORTANALYSIS_H
+#define FORTRAN_OPTIMIZER_OPENACC_ANALYSIS_FIROPENACCSUPPORTANALYSIS_H
+
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Value.h"
+#include <string>
+
+namespace fir {
+namespace acc {
+
+/// FIR-specific implementation for the OpenACCSupport analysis interface.
+///
+/// This class provides the custom implementations of the OpenACCSupport
+/// interface methods that are tailored to FIR's requirements and
+/// can handle FIR dialect operations and types.
+/// Its primary intent is to be registered with the OpenACCSupport analysis
+/// using setImplementation().
+///
+/// Usage:
+/// auto &support = getAnalysis<mlir::acc::OpenACCSupport>();
+/// support.setImplementation(fir::acc::FIROpenACCSupportAnalysis());
+///
+class FIROpenACCSupportAnalysis {
+public:
+ FIROpenACCSupportAnalysis() = default;
+
+ std::string getVariableName(mlir::Value v);
+
+ std::string getRecipeName(mlir::acc::RecipeKind kind, mlir::Type type,
+ mlir::Value var);
+
+ mlir::InFlightDiagnostic emitNYI(mlir::Location loc,
+ const mlir::Twine &message);
+};
+
+} // namespace acc
+} // namespace fir
+
+#endif // FORTRAN_OPTIMIZER_OPENACC_ANALYSIS_FIROPENACCSUPPORTANALYSIS_H
diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.h b/flang/include/flang/Optimizer/OpenACC/Passes.h
index 0627cc8..c27c7ebc 100644
--- a/flang/include/flang/Optimizer/OpenACC/Passes.h
+++ b/flang/include/flang/Optimizer/OpenACC/Passes.h
@@ -13,6 +13,9 @@
#ifndef FORTRAN_OPTIMIZER_OPENACC_PASSES_H
#define FORTRAN_OPTIMIZER_OPENACC_PASSES_H
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassRegistry.h"
@@ -25,6 +28,7 @@ namespace acc {
#define GEN_PASS_REGISTRATION
#include "flang/Optimizer/OpenACC/Passes.h.inc"
+std::unique_ptr<mlir::Pass> createACCInitializeFIRAnalysesPass();
std::unique_ptr<mlir::Pass> createACCRecipeBufferizationPass();
} // namespace acc
diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.td b/flang/include/flang/Optimizer/OpenACC/Passes.td
index 3c127b3..d947aa4 100644
--- a/flang/include/flang/Optimizer/OpenACC/Passes.td
+++ b/flang/include/flang/Optimizer/OpenACC/Passes.td
@@ -11,6 +11,22 @@
include "mlir/Pass/PassBase.td"
+def ACCInitializeFIRAnalyses
+ : Pass<"acc-initialize-fir-analyses", "mlir::ModuleOp"> {
+ let summary = "Initialize FIR analyses for OpenACC passes";
+ let description = [{
+ This pass initializes analyses that can be used by subsequent OpenACC passes
+ in the pipeline. It creates and caches the OpenACCSupport analysis with a
+ FIR-specific implementation that can handle FIR types and operations.
+ It also initializes FIR's AliasAnalysis for use in OpenACC passes.
+ This pass needs to be rerun if any analyses were invalidated by MLIR's framework.
+ }];
+ // In addition to pre-registering the needed analyses, this pass also
+ // pre-registers the dialects that various OpenACC passes may generate.
+ let dependentDialects = ["fir::FIROpsDialect", "hlfir::hlfirDialect",
+ "mlir::acc::OpenACCDialect"];
+}
+
def ACCRecipeBufferization
: Pass<"fir-acc-recipe-bufferization", "mlir::ModuleOp"> {
let summary = "Rewrite acc.*.recipe box values to ref<box> and update uses";
diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h
new file mode 100644
index 0000000..5ca0925
--- /dev/null
+++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h
@@ -0,0 +1,57 @@
+//===- FIROpenACCUtils.h - FIR OpenACC Utilities ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares utility functions for FIR OpenACC support.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_OPENACC_SUPPORT_FIROPENACCUTILS_H
+#define FORTRAN_OPTIMIZER_OPENACC_SUPPORT_FIROPENACCUTILS_H
+
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Value.h"
+#include <string>
+
+namespace fir {
+namespace acc {
+
+/// Attempts to extract the variable name from a value by walking through
+/// FIR operations and looking for variable names.
+/// \param v The value to extract the variable name from
+/// \param preferDemangledName If true, prefers demangled/bindc names over
+/// mangled/unique names. If false, prefers mangled names.
+/// Returns empty string if no name is found.
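+///
+/// For example, a value rooted at a declare operation with uniq_name "_QFEa"
+/// (an illustrative mangled name) yields "a" when \p preferDemangledName is
+/// true, and "_QFEa" otherwise.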
+std::string getVariableName(mlir::Value v, bool preferDemangledName = true);
+
+/// Get the recipe name for a given recipe kind, FIR type, and optional
+/// variable. Uses FIR's type string representation with appropriate prefix. For
+/// firstprivate and reduction recipes, handles bounds suffix when all bounds
+/// are constant. For reduction recipes, embeds the operator name in the recipe.
+/// \param kind The recipe kind (private, firstprivate, or reduction)
+/// \param type The FIR type (must be a FIR type)
+/// \param var Optional variable value
+/// \param bounds Optional bounds for array sections (used for suffix
+/// generation)
+/// \param reductionOp Optional reduction operator (required for reduction
+/// recipes)
+/// \return The complete recipe name with all necessary suffixes
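+///
+/// For example, an AccAdd reduction over a 32-bit integer reference may yield
+/// a name like "reduction_add_ref_i32" (an illustrative rendering of
+/// fir::getTypeAsString), and constant bounds with lb=0/ub=9 contribute the
+/// suffix "_section_lb0.ub9".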
+std::string getRecipeName(mlir::acc::RecipeKind kind, mlir::Type type,
+ mlir::Value var = nullptr,
+ llvm::ArrayRef<mlir::Value> bounds = {},
+ mlir::acc::ReductionOperator reductionOp =
+ mlir::acc::ReductionOperator::AccNone);
+
+/// Check if all bounds are expressed with constant values.
+/// \param bounds Array of DataBoundsOp values to check
+/// \return true if all bounds have constant lowerbound/upperbound or extent
+bool areAllBoundsConstant(llvm::ArrayRef<mlir::Value> bounds);
+
+} // namespace acc
+} // namespace fir
+
+#endif // FORTRAN_OPTIMIZER_OPENACC_SUPPORT_FIROPENACCUTILS_H
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 548ca67..f05c4cf 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -595,9 +595,15 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
// -cc1` does accept multiple action options, but will only consider the
// rightmost one.
if (args.hasMultipleArgs(clang::driver::options::OPT_Action_Group)) {
- const unsigned diagID = diags.getCustomDiagID(
- clang::DiagnosticsEngine::Error, "Only one action option is allowed");
- diags.Report(diagID);
+ llvm::SmallString<32> buf;
+ llvm::raw_svector_ostream os(buf);
+ for (const llvm::opt::Arg *arg :
+ args.filtered(clang::driver::options::OPT_Action_Group)) {
+ if (buf.size())
+ os << ", ";
+ os << "'" << arg->getSpelling() << "'";
+ }
+ diags.Report(clang::diag::err_drv_too_many_actions) << buf;
return false;
}
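For example, `%flang_fc1 -E -fsyntax-only` now reports `only one action option is allowed.` followed by `Got '-E', '-fsyntax-only'`, as exercised by the updated multiple-actions-error.f95 test below.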
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 1f75ed1..bb4c95a 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -28,6 +28,7 @@
#include "flang/Optimizer/Builder/IntrinsicCall.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h"
#include "flang/Parser/parse-tree-visitor.h"
#include "flang/Parser/parse-tree.h"
#include "flang/Parser/tools.h"
@@ -1159,18 +1160,6 @@ bool isConstantBound(mlir::acc::DataBoundsOp &op) {
return false;
}
-/// Return true iff all the bounds are expressed with constant values.
-bool areAllBoundConstant(const llvm::SmallVector<mlir::Value> &bounds) {
- for (auto bound : bounds) {
- auto dataBound =
- mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
- assert(dataBound && "Must be DataBoundOp operation");
- if (!isConstantBound(dataBound))
- return false;
- }
- return true;
-}
-
static llvm::SmallVector<mlir::Value>
genConstantBounds(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::acc::DataBoundsOp &dataBound) {
@@ -1196,59 +1185,6 @@ genConstantBounds(fir::FirOpBuilder &builder, mlir::Location loc,
return {lb, ub, step};
}
-static mlir::Value genShapeFromBoundsOrArgs(
- mlir::Location loc, fir::FirOpBuilder &builder, fir::SequenceType seqTy,
- const llvm::SmallVector<mlir::Value> &bounds, mlir::ValueRange arguments) {
- llvm::SmallVector<mlir::Value> args;
- if (bounds.empty() && seqTy) {
- if (seqTy.hasDynamicExtents()) {
- assert(!arguments.empty() && "arguments must hold the entity");
- auto entity = hlfir::Entity{arguments[0]};
- return hlfir::genShape(loc, builder, entity);
- }
- return genShapeOp(builder, seqTy, loc).getResult();
- } else if (areAllBoundConstant(bounds)) {
- for (auto bound : llvm::reverse(bounds)) {
- auto dataBound =
- mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
- args.append(genConstantBounds(builder, loc, dataBound));
- }
- } else {
- assert(((arguments.size() - 2) / 3 == seqTy.getDimension()) &&
- "Expect 3 block arguments per dimension");
- for (auto arg : arguments.drop_front(2))
- args.push_back(arg);
- }
-
- assert(args.size() % 3 == 0 && "Triplets must be a multiple of 3");
- llvm::SmallVector<mlir::Value> extents;
- mlir::Type idxTy = builder.getIndexType();
- mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
- mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0);
- for (unsigned i = 0; i < args.size(); i += 3) {
- mlir::Value s1 =
- mlir::arith::SubIOp::create(builder, loc, args[i + 1], args[0]);
- mlir::Value s2 = mlir::arith::AddIOp::create(builder, loc, s1, one);
- mlir::Value s3 =
- mlir::arith::DivSIOp::create(builder, loc, s2, args[i + 2]);
- mlir::Value cmp = mlir::arith::CmpIOp::create(
- builder, loc, mlir::arith::CmpIPredicate::sgt, s3, zero);
- mlir::Value ext =
- mlir::arith::SelectOp::create(builder, loc, cmp, s3, zero);
- extents.push_back(ext);
- }
- return fir::ShapeOp::create(builder, loc, extents);
-}
-
-static hlfir::DesignateOp::Subscripts
-getSubscriptsFromArgs(mlir::ValueRange args) {
- hlfir::DesignateOp::Subscripts triplets;
- for (unsigned i = 2; i < args.size(); i += 3)
- triplets.emplace_back(
- hlfir::DesignateOp::Triplet{args[i], args[i + 1], args[i + 2]});
- return triplets;
-}
-
static hlfir::Entity genDesignateWithTriplets(
fir::FirOpBuilder &builder, mlir::Location loc, hlfir::Entity &entity,
hlfir::DesignateOp::Subscripts &triplets, mlir::Value shape) {
@@ -1262,19 +1198,88 @@ static hlfir::Entity genDesignateWithTriplets(
return hlfir::Entity{designate.getResult()};
}
-mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe(
- fir::FirOpBuilder &builder, llvm::StringRef recipeName, mlir::Location loc,
- mlir::Type ty, llvm::SmallVector<mlir::Value> &bounds) {
- mlir::ModuleOp mod =
- builder.getBlock()->getParent()->getParentOfType<mlir::ModuleOp>();
- if (auto recipe =
- mod.lookupSymbol<mlir::acc::FirstprivateRecipeOp>(recipeName))
- return recipe;
+// Designate uses triplets based on object lower bounds while acc.bounds are
+// zero-based. This helper shifts the bounds to create the designate triplets.
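+// For example, zero-based acc bounds (lb=3, ub=8, step=1) on an entity whose
+// lower bound is 1 yield the designate triplet 4:9:1, i.e. the Fortran
+// section a(4:9) (illustrative values).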
+static hlfir::DesignateOp::Subscripts
+genTripletsFromAccBounds(fir::FirOpBuilder &builder, mlir::Location loc,
+ const llvm::SmallVector<mlir::Value> &accBounds,
+ hlfir::Entity entity) {
+ assert(entity.getRank() * 3 == static_cast<int>(accBounds.size()) &&
+ "must get lb,ub,step for each dimension");
+ hlfir::DesignateOp::Subscripts triplets;
+ for (unsigned i = 0; i < accBounds.size(); i += 3) {
+ mlir::Value lb = hlfir::genLBound(loc, builder, entity, i / 3);
+ lb = builder.createConvert(loc, accBounds[i].getType(), lb);
+ assert(accBounds[i].getType() == accBounds[i + 1].getType() &&
+ "mix of integer types in triplets");
+ mlir::Value sliceLB =
+ builder.createOrFold<mlir::arith::AddIOp>(loc, accBounds[i], lb);
+ mlir::Value sliceUB =
+ builder.createOrFold<mlir::arith::AddIOp>(loc, accBounds[i + 1], lb);
+ triplets.emplace_back(
+ hlfir::DesignateOp::Triplet{sliceLB, sliceUB, accBounds[i + 2]});
+ }
+ return triplets;
+}
- auto ip = builder.saveInsertionPoint();
- auto recipe = genRecipeOp<mlir::acc::FirstprivateRecipeOp>(
- builder, mod, recipeName, loc, ty);
- bool allConstantBound = areAllBoundConstant(bounds);
+static std::pair<hlfir::Entity, hlfir::Entity>
+genArraySectionsInRecipe(fir::FirOpBuilder &builder, mlir::Location loc,
+ llvm::SmallVector<mlir::Value> &dataOperationBounds,
+ mlir::ValueRange recipeArguments,
+ bool allConstantBound, hlfir::Entity lhs,
+ hlfir::Entity rhs) {
+ lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs);
+ rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
+ // Get the list of lb,ub,step values for the sections that can be used inside
+ // the recipe region.
+ llvm::SmallVector<mlir::Value> bounds;
+ if (allConstantBound) {
+    // For constant bounds, the bounds are not region arguments. Materialize
+    // constants by looking at the IR for the bounds on the data operation.
+ for (auto bound : dataOperationBounds) {
+ auto dataBound =
+ mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
+ bounds.append(genConstantBounds(builder, loc, dataBound));
+ }
+ } else {
+ // If one bound is not constant, all of the bounds are region arguments.
+ for (auto arg : recipeArguments.drop_front(2))
+ bounds.push_back(arg);
+ }
+ // Compute the fir.shape of the array section and the triplets to create
+ // hlfir.designate.
+ assert(lhs.getRank() * 3 == static_cast<int>(bounds.size()) &&
+ "must get lb,ub,step for each dimension");
+ llvm::SmallVector<mlir::Value> extents;
+ mlir::Type idxTy = builder.getIndexType();
+ for (unsigned i = 0; i < bounds.size(); i += 3)
+ extents.push_back(builder.genExtentFromTriplet(
+ loc, bounds[i], bounds[i + 1], bounds[i + 2], idxTy));
+ mlir::Value shape = fir::ShapeOp::create(builder, loc, extents);
+ hlfir::DesignateOp::Subscripts rhsTriplets =
+ genTripletsFromAccBounds(builder, loc, bounds, rhs);
+ hlfir::DesignateOp::Subscripts lhsTriplets;
+  // Share the triplets when both lhs and rhs are known to be 1-based to avoid
+  // noise in the IR for the most common cases.
+ if (!lhs.mayHaveNonDefaultLowerBounds() &&
+ !rhs.mayHaveNonDefaultLowerBounds())
+ lhsTriplets = rhsTriplets;
+ else
+ lhsTriplets = genTripletsFromAccBounds(builder, loc, bounds, lhs);
+ hlfir::Entity leftSection =
+ genDesignateWithTriplets(builder, loc, lhs, lhsTriplets, shape);
+ hlfir::Entity rightSection =
+ genDesignateWithTriplets(builder, loc, rhs, rhsTriplets, shape);
+ return {leftSection, rightSection};
+}
+
+// Create the combiner or copy region block with its block arguments, and
+// return the source and destination entities.
+static std::pair<hlfir::Entity, hlfir::Entity>
+genRecipeCombinerOrCopyRegion(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Type ty, mlir::Region &region,
+ llvm::SmallVector<mlir::Value> &bounds,
+ bool allConstantBound) {
llvm::SmallVector<mlir::Type> argsTy{ty, ty};
llvm::SmallVector<mlir::Location> argsLoc{loc, loc};
if (!allConstantBound) {
@@ -1289,100 +1294,57 @@ mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe(
argsLoc.push_back(dataBound.getStartIdx().getLoc());
}
}
- builder.createBlock(&recipe.getCopyRegion(), recipe.getCopyRegion().end(),
- argsTy, argsLoc);
+ mlir::Block *block =
+ builder.createBlock(&region, region.end(), argsTy, argsLoc);
+ builder.setInsertionPointToEnd(&region.back());
+ return {hlfir::Entity{block->getArgument(0)},
+ hlfir::Entity{block->getArgument(1)}};
+}
- builder.setInsertionPointToEnd(&recipe.getCopyRegion().back());
- ty = fir::unwrapRefType(ty);
- if (fir::isa_trivial(ty)) {
- mlir::Value initValue = fir::LoadOp::create(
- builder, loc, recipe.getCopyRegion().front().getArgument(0));
- fir::StoreOp::create(builder, loc, initValue,
- recipe.getCopyRegion().front().getArgument(1));
- } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(ty)) {
- fir::FirOpBuilder firBuilder{builder, recipe.getOperation()};
- auto shape = genShapeFromBoundsOrArgs(
- loc, firBuilder, seqTy, bounds, recipe.getCopyRegion().getArguments());
-
- auto leftDeclOp = hlfir::DeclareOp::create(
- builder, loc, recipe.getCopyRegion().getArgument(0), llvm::StringRef{},
- shape);
- auto rightDeclOp = hlfir::DeclareOp::create(
- builder, loc, recipe.getCopyRegion().getArgument(1), llvm::StringRef{},
- shape);
-
- hlfir::DesignateOp::Subscripts triplets =
- getSubscriptsFromArgs(recipe.getCopyRegion().getArguments());
- auto leftEntity = hlfir::Entity{leftDeclOp.getBase()};
- auto left =
- genDesignateWithTriplets(firBuilder, loc, leftEntity, triplets, shape);
- auto rightEntity = hlfir::Entity{rightDeclOp.getBase()};
- auto right =
- genDesignateWithTriplets(firBuilder, loc, rightEntity, triplets, shape);
-
- hlfir::AssignOp::create(firBuilder, loc, left, right);
-
- } else if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(ty)) {
- fir::FirOpBuilder firBuilder{builder, recipe.getOperation()};
- llvm::SmallVector<mlir::Value> tripletArgs;
- mlir::Type innerTy = fir::extractSequenceType(boxTy);
- fir::SequenceType seqTy =
- mlir::dyn_cast_or_null<fir::SequenceType>(innerTy);
- if (!seqTy)
- TODO(loc, "Unsupported boxed type in OpenACC firstprivate");
-
- auto shape = genShapeFromBoundsOrArgs(
- loc, firBuilder, seqTy, bounds, recipe.getCopyRegion().getArguments());
- hlfir::DesignateOp::Subscripts triplets =
- getSubscriptsFromArgs(recipe.getCopyRegion().getArguments());
- auto leftEntity = hlfir::Entity{recipe.getCopyRegion().getArgument(0)};
- auto left =
- genDesignateWithTriplets(firBuilder, loc, leftEntity, triplets, shape);
- auto rightEntity = hlfir::Entity{recipe.getCopyRegion().getArgument(1)};
- auto right =
- genDesignateWithTriplets(firBuilder, loc, rightEntity, triplets, shape);
- hlfir::AssignOp::create(firBuilder, loc, left, right);
- } else {
- // Copy scalar derived type.
- // The temporary_lhs flag allows indicating that user defined assignments
- // should not be called while copying components, and that the LHS and RHS
- // are known to not alias since the LHS is a created object.
- hlfir::AssignOp::create(
- builder, loc, recipe.getCopyRegion().getArgument(0),
- recipe.getCopyRegion().getArgument(1), /*realloc=*/false,
- /*keep_lhs_length_if_realloc=*/false, /*temporary_lhs=*/true);
- }
+mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe(
+ fir::FirOpBuilder &builder, llvm::StringRef recipeName, mlir::Location loc,
+ mlir::Type ty, llvm::SmallVector<mlir::Value> &bounds) {
+ mlir::ModuleOp mod =
+ builder.getBlock()->getParent()->getParentOfType<mlir::ModuleOp>();
+ if (auto recipe =
+ mod.lookupSymbol<mlir::acc::FirstprivateRecipeOp>(recipeName))
+ return recipe;
- mlir::acc::TerminatorOp::create(builder, loc);
- builder.restoreInsertionPoint(ip);
- return recipe;
-}
+ mlir::OpBuilder::InsertionGuard guard(builder);
+ auto recipe = genRecipeOp<mlir::acc::FirstprivateRecipeOp>(
+ builder, mod, recipeName, loc, ty);
+ bool allConstantBound = fir::acc::areAllBoundsConstant(bounds);
+ auto [source, destination] = genRecipeCombinerOrCopyRegion(
+ builder, loc, ty, recipe.getCopyRegion(), bounds, allConstantBound);
+
+ fir::FirOpBuilder firBuilder{builder, recipe.getOperation()};
+
+ source = hlfir::derefPointersAndAllocatables(loc, builder, source);
+ destination = hlfir::derefPointersAndAllocatables(loc, builder, destination);
-/// Get a string representation of the bounds.
-std::string getBoundsString(llvm::SmallVector<mlir::Value> &bounds) {
- std::stringstream boundStr;
if (!bounds.empty())
- boundStr << "_section_";
- llvm::interleave(
- bounds,
- [&](mlir::Value bound) {
- auto boundsOp =
- mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
- if (boundsOp.getLowerbound() &&
- fir::getIntIfConstant(boundsOp.getLowerbound()) &&
- boundsOp.getUpperbound() &&
- fir::getIntIfConstant(boundsOp.getUpperbound())) {
- boundStr << "lb" << *fir::getIntIfConstant(boundsOp.getLowerbound())
- << ".ub" << *fir::getIntIfConstant(boundsOp.getUpperbound());
- } else if (boundsOp.getExtent() &&
- fir::getIntIfConstant(boundsOp.getExtent())) {
- boundStr << "ext" << *fir::getIntIfConstant(boundsOp.getExtent());
- } else {
- boundStr << "?";
- }
- },
- [&] { boundStr << "x"; });
- return boundStr.str();
+ std::tie(source, destination) = genArraySectionsInRecipe(
+ firBuilder, loc, bounds, recipe.getCopyRegion().getArguments(),
+ allConstantBound, source, destination);
+  // The source and the destination of the firstprivate copy cannot alias, and
+  // the destination is already properly allocated, so a simple assignment can
+  // be generated right away. This avoids ending up with runtime calls for
+  // arrays of numerical, logical, and character types.
+  //
+  // The temporary_lhs flag indicates that user-defined assignments should not
+  // be called while copying components, and that the LHS and RHS are known
+  // not to alias since the LHS is a created object.
+  //
+  // TODO: detect cases where a user-defined assignment is needed and emit a
+  // TODO there. Using temporary_lhs allows more aggressive optimizations of
+  // simple derived types. Existing compilers supporting OpenACC do not call
+  // user-defined assignments; a concrete use case is needed to decide what to
+  // do.
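+  //
+  // The copy region thus reduces to a single assignment; e.g. for a boxed
+  // array without bounds, the updated acc-private.f90 expectations below show:
+  //   hlfir.assign %arg0 to %arg1 temporary_lhs
+  //       : !fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>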
+ source = hlfir::loadTrivialScalar(loc, builder, source);
+ hlfir::AssignOp::create(builder, loc, source, destination, /*realloc=*/false,
+ /*keep_lhs_length_if_realloc=*/false,
+ /*temporary_lhs=*/true);
+ mlir::acc::TerminatorOp::create(builder, loc);
+ return recipe;
}
/// Rebuild the array type from the acc.bounds operation with constant
@@ -1458,9 +1420,8 @@ static void genPrivatizationRecipes(
RecipeOp recipe;
mlir::Type retTy = getTypeFromBounds(bounds, info.addr.getType());
if constexpr (std::is_same_v<RecipeOp, mlir::acc::PrivateRecipeOp>) {
- std::string recipeName =
- fir::getTypeAsString(retTy, converter.getKindMap(),
- Fortran::lower::privatizationRecipePrefix);
+ std::string recipeName = fir::acc::getRecipeName(
+ mlir::acc::RecipeKind::private_recipe, retTy, info.addr, bounds);
recipe = Fortran::lower::createOrGetPrivateRecipe(builder, recipeName,
operandLocation, retTy);
auto op = createDataEntryOp<mlir::acc::PrivateOp>(
@@ -1474,10 +1435,8 @@ static void genPrivatizationRecipes(
symbolPairs->emplace_back(op.getAccVar(),
Fortran::semantics::SymbolRef(symbol));
} else {
- std::string suffix =
- areAllBoundConstant(bounds) ? getBoundsString(bounds) : "";
- std::string recipeName = fir::getTypeAsString(
- retTy, converter.getKindMap(), "firstprivatization" + suffix);
+ std::string recipeName = fir::acc::getRecipeName(
+ mlir::acc::RecipeKind::firstprivate_recipe, retTy, info.addr, bounds);
recipe = Fortran::lower::createOrGetFirstprivateRecipe(
builder, recipeName, operandLocation, retTy, bounds);
auto op = createDataEntryOp<mlir::acc::FirstprivateOp>(
@@ -1611,205 +1570,6 @@ static mlir::Value genScalarCombiner(fir::FirOpBuilder &builder,
TODO(loc, "reduction operator");
}
-static hlfir::DesignateOp::Subscripts
-getTripletsFromArgs(mlir::acc::ReductionRecipeOp recipe) {
- hlfir::DesignateOp::Subscripts triplets;
- for (unsigned i = 2; i < recipe.getCombinerRegion().getArguments().size();
- i += 3)
- triplets.emplace_back(hlfir::DesignateOp::Triplet{
- recipe.getCombinerRegion().getArgument(i),
- recipe.getCombinerRegion().getArgument(i + 1),
- recipe.getCombinerRegion().getArgument(i + 2)});
- return triplets;
-}
-
-static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc,
- mlir::acc::ReductionOperator op, mlir::Type ty,
- mlir::Value value1, mlir::Value value2,
- mlir::acc::ReductionRecipeOp &recipe,
- llvm::SmallVector<mlir::Value> &bounds,
- bool allConstantBound) {
- ty = fir::unwrapRefType(ty);
-
- if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty)) {
- mlir::Type refTy = fir::ReferenceType::get(seqTy.getEleTy());
- llvm::SmallVector<fir::DoLoopOp> loops;
- llvm::SmallVector<mlir::Value> ivs;
- if (seqTy.hasDynamicExtents()) {
- auto shape =
- genShapeFromBoundsOrArgs(loc, builder, seqTy, bounds,
- recipe.getCombinerRegion().getArguments());
- auto v1DeclareOp = hlfir::DeclareOp::create(builder, loc, value1,
- llvm::StringRef{}, shape);
- auto v2DeclareOp = hlfir::DeclareOp::create(builder, loc, value2,
- llvm::StringRef{}, shape);
- hlfir::DesignateOp::Subscripts triplets = getTripletsFromArgs(recipe);
-
- llvm::SmallVector<mlir::Value> lenParamsLeft;
- auto leftEntity = hlfir::Entity{v1DeclareOp.getBase()};
- hlfir::genLengthParameters(loc, builder, leftEntity, lenParamsLeft);
- auto leftDesignate = hlfir::DesignateOp::create(
- builder, loc, v1DeclareOp.getBase().getType(), v1DeclareOp.getBase(),
- /*component=*/"",
- /*componentShape=*/mlir::Value{}, triplets,
- /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt,
- shape, lenParamsLeft);
- auto left = hlfir::Entity{leftDesignate.getResult()};
-
- llvm::SmallVector<mlir::Value> lenParamsRight;
- auto rightEntity = hlfir::Entity{v2DeclareOp.getBase()};
- hlfir::genLengthParameters(loc, builder, rightEntity, lenParamsLeft);
- auto rightDesignate = hlfir::DesignateOp::create(
- builder, loc, v2DeclareOp.getBase().getType(), v2DeclareOp.getBase(),
- /*component=*/"",
- /*componentShape=*/mlir::Value{}, triplets,
- /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt,
- shape, lenParamsRight);
- auto right = hlfir::Entity{rightDesignate.getResult()};
-
- llvm::SmallVector<mlir::Value, 1> typeParams;
- auto genKernel = [&builder, &loc, op, seqTy, &left, &right](
- mlir::Location l, fir::FirOpBuilder &b,
- mlir::ValueRange oneBasedIndices) -> hlfir::Entity {
- auto leftElement = hlfir::getElementAt(l, b, left, oneBasedIndices);
- auto rightElement = hlfir::getElementAt(l, b, right, oneBasedIndices);
- auto leftVal = hlfir::loadTrivialScalar(l, b, leftElement);
- auto rightVal = hlfir::loadTrivialScalar(l, b, rightElement);
- return hlfir::Entity{genScalarCombiner(
- builder, loc, op, seqTy.getEleTy(), leftVal, rightVal)};
- };
- mlir::Value elemental = hlfir::genElementalOp(
- loc, builder, seqTy.getEleTy(), shape, typeParams, genKernel,
- /*isUnordered=*/true);
- hlfir::AssignOp::create(builder, loc, elemental, v1DeclareOp.getBase());
- return;
- }
- if (bounds.empty()) {
- llvm::SmallVector<mlir::Value> extents;
- mlir::Type idxTy = builder.getIndexType();
- for (auto extent : llvm::reverse(seqTy.getShape())) {
- mlir::Value lb = mlir::arith::ConstantOp::create(
- builder, loc, idxTy, builder.getIntegerAttr(idxTy, 0));
- mlir::Value ub = mlir::arith::ConstantOp::create(
- builder, loc, idxTy, builder.getIntegerAttr(idxTy, extent - 1));
- mlir::Value step = mlir::arith::ConstantOp::create(
- builder, loc, idxTy, builder.getIntegerAttr(idxTy, 1));
- auto loop = fir::DoLoopOp::create(builder, loc, lb, ub, step,
- /*unordered=*/false);
- builder.setInsertionPointToStart(loop.getBody());
- loops.push_back(loop);
- ivs.push_back(loop.getInductionVar());
- }
- } else if (allConstantBound) {
- // Use the constant bound directly in the combiner region so they do not
- // need to be passed as block argument.
- assert(!bounds.empty() &&
- "seq type with constant bounds cannot have empty bounds");
- for (auto bound : llvm::reverse(bounds)) {
- auto dataBound =
- mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
- llvm::SmallVector<mlir::Value> values =
- genConstantBounds(builder, loc, dataBound);
- auto loop =
- fir::DoLoopOp::create(builder, loc, values[0], values[1], values[2],
- /*unordered=*/false);
- builder.setInsertionPointToStart(loop.getBody());
- loops.push_back(loop);
- ivs.push_back(loop.getInductionVar());
- }
- } else {
- // Lowerbound, upperbound and step are passed as block arguments.
- unsigned nbRangeArgs =
- recipe.getCombinerRegion().getArguments().size() - 2;
- assert((nbRangeArgs / 3 == seqTy.getDimension()) &&
- "Expect 3 block arguments per dimension");
- for (int i = nbRangeArgs - 1; i >= 2; i -= 3) {
- mlir::Value lb = recipe.getCombinerRegion().getArgument(i);
- mlir::Value ub = recipe.getCombinerRegion().getArgument(i + 1);
- mlir::Value step = recipe.getCombinerRegion().getArgument(i + 2);
- auto loop = fir::DoLoopOp::create(builder, loc, lb, ub, step,
- /*unordered=*/false);
- builder.setInsertionPointToStart(loop.getBody());
- loops.push_back(loop);
- ivs.push_back(loop.getInductionVar());
- }
- }
- llvm::SmallVector<mlir::Value> reversedIvs(ivs.rbegin(), ivs.rend());
- auto addr1 =
- fir::CoordinateOp::create(builder, loc, refTy, value1, reversedIvs);
- auto addr2 =
- fir::CoordinateOp::create(builder, loc, refTy, value2, reversedIvs);
- auto load1 = fir::LoadOp::create(builder, loc, addr1);
- auto load2 = fir::LoadOp::create(builder, loc, addr2);
- mlir::Value res =
- genScalarCombiner(builder, loc, op, seqTy.getEleTy(), load1, load2);
- fir::StoreOp::create(builder, loc, res, addr1);
- builder.setInsertionPointAfter(loops[0]);
- } else if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
- mlir::Type innerTy = fir::unwrapRefType(boxTy.getEleTy());
- if (fir::isa_trivial(innerTy)) {
- mlir::Value boxAddr1 = value1, boxAddr2 = value2;
- if (fir::isBoxAddress(boxAddr1.getType()))
- boxAddr1 = fir::LoadOp::create(builder, loc, boxAddr1);
- if (fir::isBoxAddress(boxAddr2.getType()))
- boxAddr2 = fir::LoadOp::create(builder, loc, boxAddr2);
- boxAddr1 = fir::BoxAddrOp::create(builder, loc, boxAddr1);
- boxAddr2 = fir::BoxAddrOp::create(builder, loc, boxAddr2);
- auto leftEntity = hlfir::Entity{boxAddr1};
- auto rightEntity = hlfir::Entity{boxAddr2};
-
- auto leftVal = hlfir::loadTrivialScalar(loc, builder, leftEntity);
- auto rightVal = hlfir::loadTrivialScalar(loc, builder, rightEntity);
- mlir::Value res =
- genScalarCombiner(builder, loc, op, innerTy, leftVal, rightVal);
- hlfir::AssignOp::create(builder, loc, res, boxAddr1);
- } else {
- mlir::Type innerTy = fir::extractSequenceType(boxTy);
- fir::SequenceType seqTy =
- mlir::dyn_cast_or_null<fir::SequenceType>(innerTy);
- if (!seqTy)
- TODO(loc, "Unsupported boxed type in OpenACC reduction combiner");
-
- auto shape =
- genShapeFromBoundsOrArgs(loc, builder, seqTy, bounds,
- recipe.getCombinerRegion().getArguments());
- hlfir::DesignateOp::Subscripts triplets =
- getSubscriptsFromArgs(recipe.getCombinerRegion().getArguments());
- auto leftEntity = hlfir::Entity{value1};
- if (fir::isBoxAddress(value1.getType()))
- leftEntity = hlfir::Entity{
- fir::LoadOp::create(builder, loc, value1).getResult()};
- auto left =
- genDesignateWithTriplets(builder, loc, leftEntity, triplets, shape);
- auto rightEntity = hlfir::Entity{value2};
- if (fir::isBoxAddress(value2.getType()))
- rightEntity = hlfir::Entity{
- fir::LoadOp::create(builder, loc, value2).getResult()};
- auto right =
- genDesignateWithTriplets(builder, loc, rightEntity, triplets, shape);
-
- llvm::SmallVector<mlir::Value, 1> typeParams;
- auto genKernel = [&builder, &loc, op, seqTy, &left, &right](
- mlir::Location l, fir::FirOpBuilder &b,
- mlir::ValueRange oneBasedIndices) -> hlfir::Entity {
- auto leftElement = hlfir::getElementAt(l, b, left, oneBasedIndices);
- auto rightElement = hlfir::getElementAt(l, b, right, oneBasedIndices);
- auto leftVal = hlfir::loadTrivialScalar(l, b, leftElement);
- auto rightVal = hlfir::loadTrivialScalar(l, b, rightElement);
- return hlfir::Entity{genScalarCombiner(
- builder, loc, op, seqTy.getEleTy(), leftVal, rightVal)};
- };
- mlir::Value elemental = hlfir::genElementalOp(
- loc, builder, seqTy.getEleTy(), shape, typeParams, genKernel,
- /*isUnordered=*/true);
- hlfir::AssignOp::create(builder, loc, elemental, value1);
- }
- } else {
- mlir::Value res = genScalarCombiner(builder, loc, op, ty, value1, value2);
- fir::StoreOp::create(builder, loc, res, value1);
- }
-}
-
mlir::acc::ReductionRecipeOp Fortran::lower::createOrGetReductionRecipe(
fir::FirOpBuilder &builder, llvm::StringRef recipeName, mlir::Location loc,
mlir::Type ty, mlir::acc::ReductionOperator op,
@@ -1819,37 +1579,33 @@ mlir::acc::ReductionRecipeOp Fortran::lower::createOrGetReductionRecipe(
if (auto recipe = mod.lookupSymbol<mlir::acc::ReductionRecipeOp>(recipeName))
return recipe;
- auto ip = builder.saveInsertionPoint();
-
+ mlir::OpBuilder::InsertionGuard guard(builder);
auto recipe = genRecipeOp<mlir::acc::ReductionRecipeOp>(
builder, mod, recipeName, loc, ty, op);
-
- // The two first block arguments are the two values to be combined.
- // The next arguments are the iteration ranges (lb, ub, step) to be used
- // for the combiner if needed.
- llvm::SmallVector<mlir::Type> argsTy{ty, ty};
- llvm::SmallVector<mlir::Location> argsLoc{loc, loc};
- bool allConstantBound = areAllBoundConstant(bounds);
- if (!allConstantBound) {
- for (mlir::Value bound : llvm::reverse(bounds)) {
- auto dataBound =
- mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
- argsTy.push_back(dataBound.getLowerbound().getType());
- argsLoc.push_back(dataBound.getLowerbound().getLoc());
- argsTy.push_back(dataBound.getUpperbound().getType());
- argsLoc.push_back(dataBound.getUpperbound().getLoc());
- argsTy.push_back(dataBound.getStartIdx().getType());
- argsLoc.push_back(dataBound.getStartIdx().getLoc());
- }
- }
- builder.createBlock(&recipe.getCombinerRegion(),
- recipe.getCombinerRegion().end(), argsTy, argsLoc);
- builder.setInsertionPointToEnd(&recipe.getCombinerRegion().back());
- mlir::Value v1 = recipe.getCombinerRegion().front().getArgument(0);
- mlir::Value v2 = recipe.getCombinerRegion().front().getArgument(1);
- genCombiner(builder, loc, op, ty, v1, v2, recipe, bounds, allConstantBound);
- mlir::acc::YieldOp::create(builder, loc, v1);
- builder.restoreInsertionPoint(ip);
+ bool allConstantBound = fir::acc::areAllBoundsConstant(bounds);
+
+ auto [dest, src] = genRecipeCombinerOrCopyRegion(
+ builder, loc, ty, recipe.getCombinerRegion(), bounds, allConstantBound);
+  // Generate loops that combine and assign the inputs into dest (or into the
+  // array sections of the inputs when there are bounds).
+ hlfir::Entity srcSection = src;
+ hlfir::Entity destSection = dest;
+ if (!bounds.empty())
+ std::tie(srcSection, destSection) = genArraySectionsInRecipe(
+ builder, loc, bounds, recipe.getCombinerRegion().getArguments(),
+ allConstantBound, srcSection, destSection);
+
+ mlir::Type elementType = fir::getFortranElementType(ty);
+ auto genKernel = [&](mlir::Location l, fir::FirOpBuilder &b,
+ hlfir::Entity srcElementValue,
+ hlfir::Entity destElementValue) -> hlfir::Entity {
+ return hlfir::Entity{genScalarCombiner(builder, loc, op, elementType,
+ srcElementValue, destElementValue)};
+ };
+ hlfir::genNoAliasAssignment(loc, builder, srcSection, destSection,
+ /*emitWorkshareLoop=*/false,
+ /*temporaryLHS=*/false, genKernel);
+ mlir::acc::YieldOp::create(builder, loc, dest);
return recipe;
}
@@ -1911,15 +1667,12 @@ genReductions(const Fortran::parser::AccObjectListWithReduction &objectList,
mlir::acc::DataClause::acc_reduction, info.addr.getType(), async,
asyncDeviceTypes, asyncOnlyDeviceTypes, /*unwrapBoxAddr=*/true);
mlir::Type ty = op.getAccVar().getType();
- if (!areAllBoundConstant(bounds) ||
+ if (!fir::acc::areAllBoundsConstant(bounds) ||
fir::isAssumedShape(info.addr.getType()) ||
fir::isAllocatableOrPointerArray(info.addr.getType()))
ty = info.addr.getType();
- std::string suffix =
- areAllBoundConstant(bounds) ? getBoundsString(bounds) : "";
- std::string recipeName = fir::getTypeAsString(
- ty, converter.getKindMap(),
- ("reduction_" + stringifyReductionOperator(mlirOp)).str() + suffix);
+ std::string recipeName = fir::acc::getRecipeName(
+ mlir::acc::RecipeKind::reduction_recipe, ty, info.addr, bounds, mlirOp);
mlir::acc::ReductionRecipeOp recipe =
Fortran::lower::createOrGetReductionRecipe(
@@ -2164,9 +1917,8 @@ static void privatizeIv(
}
if (privateOp == nullptr) {
- std::string recipeName =
- fir::getTypeAsString(ivValue.getType(), converter.getKindMap(),
- Fortran::lower::privatizationRecipePrefix);
+ std::string recipeName = fir::acc::getRecipeName(
+ mlir::acc::RecipeKind::private_recipe, ivValue.getType(), ivValue, {});
auto recipe = Fortran::lower::createOrGetPrivateRecipe(
builder, recipeName, loc, ivValue.getType());
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 93dfc57..7b69b7d 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -18,6 +18,7 @@
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/OpenMP/Passes.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/TypeSwitch.h"
@@ -1392,6 +1393,66 @@ bool hlfir::elementalOpMustProduceTemp(hlfir::ElementalOp elemental) {
return false;
}
+static void combineAndStoreElement(
+ mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity lhs,
+ hlfir::Entity rhs, bool temporaryLHS,
+ std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
+ hlfir::Entity, hlfir::Entity)> *combiner) {
+ hlfir::Entity valueToAssign = hlfir::loadTrivialScalar(loc, builder, rhs);
+ if (combiner) {
+ hlfir::Entity lhsValue = hlfir::loadTrivialScalar(loc, builder, lhs);
+ valueToAssign = (*combiner)(loc, builder, lhsValue, valueToAssign);
+ }
+ hlfir::AssignOp::create(builder, loc, valueToAssign, lhs,
+ /*realloc=*/false,
+ /*keep_lhs_length_if_realloc=*/false,
+ /*temporary_lhs=*/temporaryLHS);
+}
+
+void hlfir::genNoAliasArrayAssignment(
+ mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
+ hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS,
+ std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
+ hlfir::Entity, hlfir::Entity)> *combiner) {
+ mlir::OpBuilder::InsertionGuard guard(builder);
+ rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
+ lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs);
+ mlir::Value lhsShape = hlfir::genShape(loc, builder, lhs);
+ llvm::SmallVector<mlir::Value> lhsExtents =
+ hlfir::getIndexExtents(loc, builder, lhsShape);
+ mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs);
+ llvm::SmallVector<mlir::Value> rhsExtents =
+ hlfir::getIndexExtents(loc, builder, rhsShape);
+ llvm::SmallVector<mlir::Value> extents =
+ fir::factory::deduceOptimalExtents(lhsExtents, rhsExtents);
+ hlfir::LoopNest loopNest =
+ hlfir::genLoopNest(loc, builder, extents,
+ /*isUnordered=*/true, emitWorkshareLoop);
+ builder.setInsertionPointToStart(loopNest.body);
+ auto rhsArrayElement =
+ hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
+ rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement);
+ auto lhsArrayElement =
+ hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices);
+ combineAndStoreElement(loc, builder, lhsArrayElement, rhsArrayElement,
+ temporaryLHS, combiner);
+}
+
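+// Assign rhs to lhs assuming they do not alias, dispatching to the array
+// version above for array LHSs. A sketch of a call with a combiner, mirroring
+// the OpenACC reduction recipe lowering (names are illustrative):
+//   hlfir::genNoAliasAssignment(loc, builder, src, dest,
+//                               /*emitWorkshareLoop=*/false,
+//                               /*temporaryLHS=*/false, genKernel);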
+void hlfir::genNoAliasAssignment(
+ mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
+ hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS,
+ std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
+ hlfir::Entity, hlfir::Entity)> *combiner) {
+ if (lhs.isArray()) {
+ genNoAliasArrayAssignment(loc, builder, rhs, lhs, emitWorkshareLoop,
+ temporaryLHS, combiner);
+ return;
+ }
+ rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
+ lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs);
+ combineAndStoreElement(loc, builder, lhs, rhs, temporaryLHS, combiner);
+}
+
std::pair<hlfir::Entity, bool>
hlfir::createTempFromMold(mlir::Location loc, fir::FirOpBuilder &builder,
hlfir::Entity mold) {
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
index 86d3974..1fc592c 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
@@ -107,26 +107,8 @@ public:
mlir::Location loc = assign->getLoc();
fir::FirOpBuilder builder(rewriter, assign.getOperation());
builder.setInsertionPoint(assign);
- rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
- lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs);
- mlir::Value lhsShape = hlfir::genShape(loc, builder, lhs);
- llvm::SmallVector<mlir::Value> lhsExtents =
- hlfir::getIndexExtents(loc, builder, lhsShape);
- mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs);
- llvm::SmallVector<mlir::Value> rhsExtents =
- hlfir::getIndexExtents(loc, builder, rhsShape);
- llvm::SmallVector<mlir::Value> extents =
- fir::factory::deduceOptimalExtents(lhsExtents, rhsExtents);
- hlfir::LoopNest loopNest =
- hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true,
- flangomp::shouldUseWorkshareLowering(assign));
- builder.setInsertionPointToStart(loopNest.body);
- auto rhsArrayElement =
- hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
- rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement);
- auto lhsArrayElement =
- hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices);
- hlfir::AssignOp::create(builder, loc, rhsArrayElement, lhsArrayElement);
+ hlfir::genNoAliasArrayAssignment(
+ loc, builder, rhs, lhs, flangomp::shouldUseWorkshareLowering(assign));
rewriter.eraseOp(assign);
return mlir::success();
}
diff --git a/flang/lib/Optimizer/OpenACC/Analysis/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Analysis/CMakeLists.txt
new file mode 100644
index 0000000..e05d145
--- /dev/null
+++ b/flang/lib/Optimizer/OpenACC/Analysis/CMakeLists.txt
@@ -0,0 +1,22 @@
+add_flang_library(FIROpenACCAnalysis
+ FIROpenACCSupportAnalysis.cpp
+
+ DEPENDS
+ FIRAnalysis
+ FIRDialect
+ FIROpenACCSupport
+ HLFIRDialect
+
+ LINK_LIBS
+ FIRAnalysis
+ FIRDialect
+ FIROpenACCSupport
+ HLFIRDialect
+
+ MLIR_DEPS
+ MLIROpenACCDialect
+
+ MLIR_LIBS
+ MLIROpenACCDialect
+)
+
diff --git a/flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp b/flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp
new file mode 100644
index 0000000..8cdbe1d
--- /dev/null
+++ b/flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp
@@ -0,0 +1,40 @@
+//===- FIROpenACCSupportAnalysis.cpp - FIR OpenACCSupport Analysis -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the FIR-specific OpenACCSupport analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h"
+
+using namespace mlir;
+
+namespace fir {
+namespace acc {
+
+std::string FIROpenACCSupportAnalysis::getVariableName(Value v) {
+ return fir::acc::getVariableName(v, /*preferDemangledName=*/true);
+}
+
+std::string FIROpenACCSupportAnalysis::getRecipeName(mlir::acc::RecipeKind kind,
+ Type type, Value var) {
+ return fir::acc::getRecipeName(kind, type, var);
+}
+
+mlir::InFlightDiagnostic
+FIROpenACCSupportAnalysis::emitNYI(Location loc, const Twine &message) {
+ TODO(loc, message);
+ // Should be unreachable, but we return an actual diagnostic
+ // to satisfy the interface.
+ return mlir::emitError(loc, "not yet implemented: " + message.str());
+}
+
+} // namespace acc
+} // namespace fir
diff --git a/flang/lib/Optimizer/OpenACC/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/CMakeLists.txt
index 790b9fd..16a4025 100644
--- a/flang/lib/Optimizer/OpenACC/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenACC/CMakeLists.txt
@@ -1,2 +1,3 @@
+add_subdirectory(Analysis)
add_subdirectory(Support)
add_subdirectory(Transforms)
diff --git a/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt
index 898fb00..9c6f0ee 100644
--- a/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt
@@ -4,6 +4,7 @@ add_flang_library(FIROpenACCSupport
FIROpenACCAttributes.cpp
FIROpenACCOpsInterfaces.cpp
FIROpenACCTypeInterfaces.cpp
+ FIROpenACCUtils.cpp
RegisterOpenACCExtensions.cpp
DEPENDS
diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp
new file mode 100644
index 0000000..e5b8123
--- /dev/null
+++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp
@@ -0,0 +1,269 @@
+//===- FIROpenACCUtils.cpp - FIR OpenACC Utilities ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utility functions for FIR OpenACC support.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIROpsSupport.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Dialect/Support/FIRContext.h"
+#include "flang/Optimizer/Dialect/Support/KindMapping.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/Support/InternalNames.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace mlir;
+
+namespace fir {
+namespace acc {
+
+std::string getVariableName(Value v, bool preferDemangledName) {
+ std::string srcName;
+ std::string prefix;
+ llvm::SmallVector<std::string, 4> arrayIndices;
+ bool iterate = true;
+ mlir::Operation *defOp;
+
+ // For integer constants, no need to further iterate - print their value
+ // immediately.
+ if (v.getDefiningOp()) {
+ IntegerAttr::ValueType val;
+ if (matchPattern(v.getDefiningOp(), m_ConstantInt(&val))) {
+ llvm::raw_string_ostream os(prefix);
+ val.print(os, /*isSigned=*/true);
+ return prefix;
+ }
+ }
+
+ while (v && (defOp = v.getDefiningOp()) && iterate) {
+ iterate =
+ llvm::TypeSwitch<mlir::Operation *, bool>(defOp)
+ .Case<mlir::ViewLikeOpInterface>(
+ [&v](mlir::ViewLikeOpInterface op) {
+ v = op.getViewSource();
+ return true;
+ })
+ .Case<fir::ReboxOp>([&v](fir::ReboxOp op) {
+ v = op.getBox();
+ return true;
+ })
+ .Case<fir::EmboxOp>([&v](fir::EmboxOp op) {
+ v = op.getMemref();
+ return true;
+ })
+ .Case<fir::ConvertOp>([&v](fir::ConvertOp op) {
+ v = op.getValue();
+ return true;
+ })
+ .Case<fir::LoadOp>([&v](fir::LoadOp op) {
+ v = op.getMemref();
+ return true;
+ })
+ .Case<fir::BoxAddrOp>([&v](fir::BoxAddrOp op) {
+ // The box holds the name of the variable.
+ v = op.getVal();
+ return true;
+ })
+ .Case<fir::AddrOfOp>([&](fir::AddrOfOp op) {
+            // Only use the address_of symbol if the mangled name is preferred.
+ if (!preferDemangledName) {
+ auto symRef = op.getSymbol();
+ srcName = symRef.getLeafReference().getValue().str();
+ }
+ return false;
+ })
+ .Case<fir::ArrayCoorOp>([&](fir::ArrayCoorOp op) {
+ v = op.getMemref();
+ for (auto coor : op.getIndices()) {
+ auto idxName = getVariableName(coor, preferDemangledName);
+ arrayIndices.push_back(idxName.empty() ? "?" : idxName);
+ }
+ return true;
+ })
+ .Case<fir::CoordinateOp>([&](fir::CoordinateOp op) {
+ std::optional<llvm::ArrayRef<int32_t>> fieldIndices =
+ op.getFieldIndices();
+ if (fieldIndices && fieldIndices->size() > 0 &&
+ (*fieldIndices)[0] != fir::CoordinateOp::kDynamicIndex) {
+ int fieldId = (*fieldIndices)[0];
+ mlir::Type baseType =
+ fir::getFortranElementType(op.getRef().getType());
+ if (auto recType = llvm::dyn_cast<fir::RecordType>(baseType)) {
+ srcName = recType.getTypeList()[fieldId].first;
+ }
+ }
+ if (!srcName.empty()) {
+              // If the field name is known, attempt to continue building the
+              // name by looking at its parents.
+ prefix =
+ getVariableName(op.getRef(), preferDemangledName) + "%";
+ }
+ return false;
+ })
+ .Case<hlfir::DesignateOp>([&](hlfir::DesignateOp op) {
+ if (op.getComponent()) {
+ srcName = op.getComponent().value().str();
+ prefix =
+ getVariableName(op.getMemref(), preferDemangledName) + "%";
+ return false;
+ }
+ for (auto coor : op.getIndices()) {
+ auto idxName = getVariableName(coor, preferDemangledName);
+ arrayIndices.push_back(idxName.empty() ? "?" : idxName);
+ }
+ v = op.getMemref();
+ return true;
+ })
+ .Case<fir::DeclareOp, hlfir::DeclareOp>([&](auto op) {
+ srcName = op.getUniqName().str();
+ return false;
+ })
+ .Case<fir::AllocaOp>([&](fir::AllocaOp op) {
+ if (preferDemangledName) {
+ // Prefer demangled name (bindc_name over uniq_name)
+ srcName = op.getBindcName() ? *op.getBindcName()
+ : op.getUniqName() ? *op.getUniqName()
+ : "";
+ } else {
+ // Prefer mangled name (uniq_name over bindc_name)
+ srcName = op.getUniqName() ? *op.getUniqName()
+ : op.getBindcName() ? *op.getBindcName()
+ : "";
+ }
+ return false;
+ })
+ .Default([](mlir::Operation *) { return false; });
+ }
+
+  // Fall back to the default implementation.
+ if (srcName.empty())
+ return acc::getVariableName(v);
+
+ // Build array index suffix if present
+ std::string suffix;
+ if (!arrayIndices.empty()) {
+ llvm::raw_string_ostream os(suffix);
+ os << "(";
+ llvm::interleaveComma(arrayIndices, os);
+ os << ")";
+ }
+
+  // Names from FIR operations may be mangled. When the demangled name is
+  // requested, demangle it.
+ if (preferDemangledName) {
+ auto [kind, deconstructed] = fir::NameUniquer::deconstruct(srcName);
+ if (kind != fir::NameUniquer::NameKind::NOT_UNIQUED)
+ return prefix + deconstructed.name + suffix;
+ }
+
+ return prefix + srcName + suffix;
+}
+
+bool areAllBoundsConstant(llvm::ArrayRef<Value> bounds) {
+ for (auto bound : bounds) {
+ auto dataBound =
+ mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
+ if (!dataBound)
+ return false;
+
+ // Check if this bound has constant values
+ bool hasConstant = false;
+ if (dataBound.getLowerbound() && dataBound.getUpperbound())
+ hasConstant =
+ fir::getIntIfConstant(dataBound.getLowerbound()).has_value() &&
+ fir::getIntIfConstant(dataBound.getUpperbound()).has_value();
+ else if (dataBound.getExtent())
+ hasConstant = fir::getIntIfConstant(dataBound.getExtent()).has_value();
+
+ if (!hasConstant)
+ return false;
+ }
+ return true;
+}
+
+static std::string getBoundsString(llvm::ArrayRef<Value> bounds) {
+ if (bounds.empty())
+ return "";
+
+ std::string boundStr;
+ llvm::raw_string_ostream os(boundStr);
+ os << "_section_";
+
+ llvm::interleave(
+ bounds,
+ [&](Value bound) {
+ auto boundsOp =
+ mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
+ if (boundsOp.getLowerbound() &&
+ fir::getIntIfConstant(boundsOp.getLowerbound()) &&
+ boundsOp.getUpperbound() &&
+ fir::getIntIfConstant(boundsOp.getUpperbound())) {
+ os << "lb" << *fir::getIntIfConstant(boundsOp.getLowerbound())
+ << ".ub" << *fir::getIntIfConstant(boundsOp.getUpperbound());
+ } else if (boundsOp.getExtent() &&
+ fir::getIntIfConstant(boundsOp.getExtent())) {
+ os << "ext" << *fir::getIntIfConstant(boundsOp.getExtent());
+ } else {
+ os << "?";
+ }
+ },
+ [&] { os << "x"; });
+
+ return os.str();
+}
+
+std::string getRecipeName(mlir::acc::RecipeKind kind, Type type, Value var,
+ llvm::ArrayRef<Value> bounds,
+ mlir::acc::ReductionOperator reductionOp) {
+ assert(fir::isa_fir_type(type) && "getRecipeName expects a FIR type");
+
+ // Build the complete prefix with all components before calling
+ // getTypeAsString
+ std::string prefixStr;
+ llvm::raw_string_ostream prefixOS(prefixStr);
+
+ switch (kind) {
+ case mlir::acc::RecipeKind::private_recipe:
+ prefixOS << "privatization";
+    // Private recipes do not currently include bounds in the name.
+    // TODO: they should include them, but lowering tests would need to be
+    // updated.
+ break;
+ case mlir::acc::RecipeKind::firstprivate_recipe:
+ prefixOS << "firstprivatization";
+ // Add bounds to the prefix if applicable (only for firstprivate)
+ if (!bounds.empty() && areAllBoundsConstant(bounds))
+ prefixOS << getBoundsString(bounds);
+ break;
+ case mlir::acc::RecipeKind::reduction_recipe:
+ prefixOS << "reduction";
+ // Embed the reduction operator in the prefix
+ if (reductionOp != mlir::acc::ReductionOperator::AccNone)
+ prefixOS << "_"
+ << mlir::acc::stringifyReductionOperator(reductionOp).str();
+ // Add bounds to the prefix if applicable (only for reduction)
+ if (!bounds.empty() && areAllBoundsConstant(bounds))
+ prefixOS << getBoundsString(bounds);
+ break;
+ }
+
+ auto kindMap = var && var.getDefiningOp()
+ ? fir::getKindMapping(var.getDefiningOp())
+ : fir::KindMapping(type.getContext());
+ return fir::getTypeAsString(type, kindMap, prefixOS.str());
+}
+
+} // namespace acc
+} // namespace fir
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCInitializeFIRAnalyses.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCInitializeFIRAnalyses.cpp
new file mode 100644
index 0000000..679b29b
--- /dev/null
+++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCInitializeFIRAnalyses.cpp
@@ -0,0 +1,56 @@
+//===- ACCInitializeFIRAnalyses.cpp - Initialize FIR analyses ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass initializes analyses that can be reused by subsequent OpenACC
+// passes in the pipeline.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Analysis/AliasAnalysis.h"
+#include "flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h"
+#include "flang/Optimizer/OpenACC/Passes.h"
+#include "mlir/Analysis/AliasAnalysis.h"
+#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h"
+
+namespace fir {
+namespace acc {
+#define GEN_PASS_DEF_ACCINITIALIZEFIRANALYSES
+#include "flang/Optimizer/OpenACC/Passes.h.inc"
+} // namespace acc
+} // namespace fir
+
+#define DEBUG_TYPE "acc-initialize-fir-analyses"
+
+namespace {
+
+/// This pass initializes analyses for reuse by subsequent OpenACC passes in the
+/// pipeline. It creates and caches analyses like OpenACCSupport so they can be
+/// retrieved by later passes using getAnalysis() or getCachedAnalysis().
+class ACCInitializeFIRAnalysesPass
+ : public fir::acc::impl::ACCInitializeFIRAnalysesBase<
+ ACCInitializeFIRAnalysesPass> {
+public:
+ void runOnOperation() override {
+ // Initialize OpenACCSupport with FIR-specific implementation.
+ auto &openACCSupport = getAnalysis<mlir::acc::OpenACCSupport>();
+ openACCSupport.setImplementation(fir::acc::FIROpenACCSupportAnalysis());
+
+ // Initialize AliasAnalysis with FIR-specific implementation.
+ auto &aliasAnalysis = getAnalysis<mlir::AliasAnalysis>();
+ aliasAnalysis.addAnalysisImplementation(fir::AliasAnalysis());
+
+ // Mark all analyses as preserved since this pass only initializes them
+ markAllAnalysesPreserved();
+ }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> fir::acc::createACCInitializeFIRAnalysesPass() {
+ return std::make_unique<ACCInitializeFIRAnalysesPass>();
+}
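A minimal sketch of scheduling this pass ahead of other OpenACC passes so that they find the cached analyses (the pipeline contents are illustrative):

    mlir::PassManager pm(module.getContext());
    // Must run first so later passes can use getCachedAnalysis<...>().
    pm.addPass(fir::acc::createACCInitializeFIRAnalysesPass());
    pm.addPass(fir::acc::createACCRecipeBufferizationPass());
    if (mlir::failed(pm.run(module)))
      return; // handle the failure as appropriate (illustrative)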
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
index ed177ba..35aa87d 100644
--- a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
@@ -1,11 +1,15 @@
add_flang_library(FIROpenACCTransforms
+ ACCInitializeFIRAnalyses.cpp
ACCRecipeBufferization.cpp
DEPENDS
FIROpenACCPassesIncGen
LINK_LIBS
+ FIRAnalysis
FIRDialect
+ FIROpenACCAnalysis
+ HLFIRDialect
MLIR_LIBS
MLIRIR
diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
index e1e6125..8019c39 100644
--- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
@@ -718,6 +718,31 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr,
return convertRecordType(recTy, fileAttr, scope, declOp);
} else if (auto tupleTy = mlir::dyn_cast_if_present<mlir::TupleType>(Ty)) {
return convertTupleType(tupleTy, fileAttr, scope, declOp);
+ } else if (mlir::isa<mlir::FunctionType>(Ty)) {
+    // Handle function types. These represent procedure pointers after the
+    // BoxedProcedure pass has run and unwrapped the fir.boxproc type, as well
+    // as dummy procedures (which are represented as function types in FIR).
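+    // For example, "procedure(fun1), pointer :: fun_ptr" is described as a
+    // DW_TAG_pointer_type whose base type is a DISubroutineType (see the new
+    // debug-proc-ptr-e2e.f90 integration test below).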
+ llvm::SmallVector<mlir::LLVM::DITypeAttr> types;
+
+ auto funcTy = mlir::cast<mlir::FunctionType>(Ty);
+ // Add return type (or void if no return type)
+ if (funcTy.getNumResults() == 0)
+ types.push_back(mlir::LLVM::DINullTypeAttr::get(context));
+ else
+ types.push_back(
+ convertType(funcTy.getResult(0), fileAttr, scope, declOp));
+
+ for (mlir::Type paramTy : funcTy.getInputs())
+ types.push_back(convertType(paramTy, fileAttr, scope, declOp));
+
+ auto subroutineTy = mlir::LLVM::DISubroutineTypeAttr::get(
+ context, /*callingConvention=*/0, types);
+
+ return mlir::LLVM::DIDerivedTypeAttr::get(
+ context, llvm::dwarf::DW_TAG_pointer_type,
+ mlir::StringAttr::get(context, ""), subroutineTy,
+ /*sizeInBits=*/ptrSize * 8, /*alignInBits=*/0, /*offset=*/0,
+ /*optional<address space>=*/std::nullopt, /*extra data=*/nullptr);
} else if (auto refTy = mlir::dyn_cast_if_present<fir::ReferenceType>(Ty)) {
auto elTy = refTy.getEleTy();
return convertPointerLikeType(elTy, fileAttr, scope, declOp,
diff --git a/flang/test/Driver/multiple-actions-error.f95 b/flang/test/Driver/multiple-actions-error.f95
index 5ec4e91..3b2b7dc 100644
--- a/flang/test/Driver/multiple-actions-error.f95
+++ b/flang/test/Driver/multiple-actions-error.f95
@@ -1,8 +1,30 @@
-! Verify that the frontend driver error-out if multiple actions are specified
-
-! RUN: not %flang_fc1 -E -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix=ERROR
-! RUN: not %flang_fc1 -fsyntax-only -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix=ERROR
-
-! ERROR: error: Only one action option is allowed
-
-end progream
+! Verify that the frontend driver raises the expected error when multiple
+! actions are specified.
+!
+! RUN: not %flang_fc1 -fsyntax-only -fsyntax-only %s 2>&1 \
+! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-1
+!
+! RUN: not %flang_fc1 -E -fsyntax-only %s 2>&1 \
+! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-2
+!
+! RUN: not %flang_fc1 -fsyntax-only -E -emit-llvm %s 2>&1 \
+! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-3
+!
+! If one or more options are specified with -Xflang, they will appear last in
+! the error message.
+!
+! RUN: not %flang -S -Xflang -emit-llvm %s 2>&1 \
+! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-4
+!
+! RUN: not %flang -Xflang -emit-llvm -S %s 2>&1 \
+! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-4
+!
+! RUN: not %flang -Xflang -emit-obj -S -Xflang -emit-llvm %s 2>&1 \
+! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-5
+!
+! ERROR: error: only one action option is allowed.
+! ACTIONS-1: Got '-fsyntax-only', '-fsyntax-only'
+! ACTIONS-2: Got '-E', '-fsyntax-only'
+! ACTIONS-3: Got '-fsyntax-only', '-E', '-emit-llvm'
+! ACTIONS-4: Got '-S', '-emit-llvm'
+! ACTIONS-5: Got '-S', '-emit-obj', '-emit-llvm'
diff --git a/flang/test/Integration/debug-proc-ptr-e2e.f90 b/flang/test/Integration/debug-proc-ptr-e2e.f90
new file mode 100644
index 0000000..aa89160
--- /dev/null
+++ b/flang/test/Integration/debug-proc-ptr-e2e.f90
@@ -0,0 +1,26 @@
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s
+
+program test_proc_ptr
+ implicit none
+ procedure(fun1), pointer :: fun_ptr
+
+ fun_ptr => fun1
+ print *, fun_ptr(3)
+
+contains
+ integer function fun1(x)
+ integer :: x
+ fun1 = x + 1
+ end function fun1
+end program test_proc_ptr
+
+! Check that fun_ptr is declared with the correct type
+! CHECK-DAG: ![[INT:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed)
+! CHECK-DAG: ![[PTR_INT:.*]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[INT]], size: 64)
+
+! Check that fun_ptr variable is a pointer to a subroutine type
+! The order is: DILocalVariable -> pointer type -> subroutine type -> {return, params}
+! CHECK-DAG: ![[FUN_PTR_VAR:.*]] = !DILocalVariable(name: "fun_ptr", {{.*}}type: ![[PROC_PTR:[0-9]+]]
+! CHECK-DAG: ![[PROC_PTR]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[SUBR_TYPE:[0-9]+]], size: 64)
+! CHECK-DAG: ![[SUBR_TYPE]] = !DISubroutineType(types: ![[SUBR_TYPES:[0-9]+]])
+! CHECK-DAG: ![[SUBR_TYPES]] = !{![[INT]], ![[PTR_INT]]}
diff --git a/flang/test/Lower/OpenACC/acc-private.f90 b/flang/test/Lower/OpenACC/acc-private.f90
index 485825d..910e87f 100644
--- a/flang/test/Lower/OpenACC/acc-private.f90
+++ b/flang/test/Lower/OpenACC/acc-private.f90
@@ -21,10 +21,7 @@
! CHECK: acc.yield %[[DECL]]#0 : !fir.box<!fir.array<?x?x2xi32>>
! CHECK: } copy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?x?x2xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?x?x2xi32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}}, %{{.*}} : (index, index, index) -> !fir.shape<3>
-! CHECK: %[[DES_SRC:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?x?x2xi32>>, !fir.shape<3>) -> !fir.box<!fir.array<?x?x2xi32>>
-! CHECK: %[[DES_DST:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?x?x2xi32>>, !fir.shape<3>) -> !fir.box<!fir.array<?x?x2xi32>>
-! CHECK: hlfir.assign %[[DES_SRC]] to %[[DES_DST]] : !fir.box<!fir.array<?x?x2xi32>>, !fir.box<!fir.array<?x?x2xi32>>
+! CHECK: hlfir.assign %[[ARG0]] to %[[ARG1]] temporary_lhs : !fir.box<!fir.array<?x?x2xi32>>, !fir.box<!fir.array<?x?x2xi32>>
! CHECK: acc.terminator
! CHECK: } destroy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?x?x2xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?x?x2xi32>>):
@@ -38,20 +35,7 @@
! CHECK: ^bb0(%{{.*}}: !fir.box<!fir.array<?xi32>>):
! CHECK: } copy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[LB:.*]] = arith.constant 4 : index
-! CHECK: %[[UB:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: %[[C1:.*]] = arith.constant 1 : index
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[EXT0:.*]] = arith.subi %[[UB]], %[[LB]] : index
-! CHECK: %[[EXT1:.*]] = arith.addi %[[EXT0]], %[[C1]] : index
-! CHECK: %[[EXT2:.*]] = arith.divsi %[[EXT1]], %[[STEP]] : index
-! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[EXT2]], %[[C0]] : index
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[EXT2]], %[[C0]] : index
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[SELECT]] : (index) -> !fir.shape<1>
-! CHECK: %[[LEFT:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[RIGHT:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: hlfir.assign %[[LEFT]] to %[[RIGHT]] : !fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>
+! CHECK: hlfir.assign {{.*}} to {{.*}} temporary_lhs : !fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>
! CHECK: acc.terminator
! CHECK: } destroy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
@@ -71,10 +55,7 @@
! CHECK: acc.yield %[[DECL]]#0 : !fir.box<!fir.array<?xi32>>
! CHECK: } copy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: hlfir.assign %[[DES_V1]] to %[[DES_V2]] : !fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>
+! CHECK: hlfir.assign %[[ARG0]] to %[[ARG1]] temporary_lhs : !fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>
! CHECK: acc.terminator
! CHECK: } destroy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
@@ -183,12 +164,19 @@
! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<50xf32>>
! CHECK: } copy {
! CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<!fir.array<50xf32>>, %[[DST:.*]]: !fir.ref<!fir.array<50xf32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[DECL_SRC:.*]]:2 = hlfir.declare %[[SRC]](%[[SHAPE]]) {uniq_name = ""} : (!fir.ref<!fir.array<50xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<50xf32>>, !fir.ref<!fir.array<50xf32>>)
-! CHECK: %[[DECL_DST:.*]]:2 = hlfir.declare %[[DST]](%[[SHAPE]]) {uniq_name = ""} : (!fir.ref<!fir.array<50xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<50xf32>>, !fir.ref<!fir.array<50xf32>>)
-! CHECK: %[[DES_SRC:.*]] = hlfir.designate %[[DECL_SRC]]#0 shape %[[SHAPE:.*]] : (!fir.ref<!fir.array<50xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<50xf32>>
-! CHECK: %[[DES_DST:.*]] = hlfir.designate %[[DECL_DST]]#0 shape %[[SHAPE:.*]] : (!fir.ref<!fir.array<50xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<50xf32>>
-! CHECK: hlfir.assign %[[DES_SRC]] to %[[DES_DST]] : !fir.ref<!fir.array<50xf32>>, !fir.ref<!fir.array<50xf32>>
+! CHECK: %[[C50:.*]] = arith.constant 50 : index
+! CHECK: %[[C99:.*]] = arith.constant 99 : index
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! CHECK: %[[C0:.*]] = arith.constant 0 : index
+! CHECK: %[[D0:.*]] = arith.subi %[[C99]], %[[C50]] : index
+! CHECK: %[[D1:.*]] = arith.addi %[[D0]], %[[C1]] : index
+! CHECK: %[[D2:.*]] = arith.divsi %[[D1]], %[[C1]] : index
+! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[D2]], %[[C0]] : index
+! CHECK: %[[SEL:.*]] = arith.select %[[CMP]], %[[D2]], %[[C0]] : index
+! CHECK: %[[SH:.*]] = fir.shape %[[SEL]] : (index) -> !fir.shape<1>
+! CHECK: %[[SEC_SRC:.*]] = hlfir.designate %[[SRC]] (%c51{{.*}}:%c100{{.*}}:%c1{{.*}}) shape %[[SH]] : (!fir.ref<!fir.array<50xf32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<50xf32>>
+! CHECK: %[[SEC_DST:.*]] = hlfir.designate %[[DST]] (%c51{{.*}}:%c100{{.*}}:%c1{{.*}}) shape %[[SH]] : (!fir.ref<!fir.array<50xf32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<50xf32>>
+! CHECK: hlfir.assign %[[SEC_SRC]] to %[[SEC_DST]] temporary_lhs : !fir.ref<!fir.array<50xf32>>, !fir.ref<!fir.array<50xf32>>
! CHECK: acc.terminator
! CHECK: }
@@ -200,12 +188,7 @@
! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xf32>>
! CHECK: } copy {
! CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<!fir.array<100xf32>>, %[[DST:.*]]: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[DECL_SRC:.*]]:2 = hlfir.declare %[[SRC]](%[[SHAPE]]) {uniq_name = ""} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
-! CHECK: %[[DECL_DST:.*]]:2 = hlfir.declare %[[DST]](%[[SHAPE]]) {uniq_name = ""} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
-! CHECK: %[[DES_SRC:.*]] = hlfir.designate %[[DECL_SRC]]#0 shape %[[SHAPE]] : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<100xf32>>
-! CHECK: %[[DES_DST:.*]] = hlfir.designate %[[DECL_DST]]#0 shape %[[SHAPE]] : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<100xf32>>
-! CHECK: hlfir.assign %[[DES_SRC]] to %[[DES_DST]] : !fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>
+! CHECK: hlfir.assign %[[SRC]] to %[[DST]] temporary_lhs : !fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>
! CHECK: acc.terminator
! CHECK: }
@@ -217,7 +200,7 @@
! CHECK: } copy {
! CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i32>, %[[DST:.*]]: !fir.ref<i32>):
! CHECK: %[[VALUE:.*]] = fir.load %[[SRC]] : !fir.ref<i32>
-! CHECK: fir.store %[[VALUE]] to %[[DST]] : !fir.ref<i32>
+! CHECK: fir.assign %[[VALUE]] to %[[DST]] temporary_lhs : i32, !fir.ref<i32>
! CHECK: acc.terminator
! CHECK: }
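With these updates, full-array firstprivate copy regions collapse to a single `hlfir.assign ... temporary_lhs` (and scalars to `fir.assign ... temporary_lhs`), while array-section recipes keep their explicit bounds computation. A minimal firstprivate example of the kind these recipes are generated for (illustrative; the test's own source is not shown in the hunks above):

subroutine fp_example(a, n)
  implicit none
  integer :: n, i
  real :: a(n)

  !$acc parallel loop firstprivate(a)
  do i = 1, n
    a(i) = a(i) + 1.0
  end do
end subroutine fp_example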
diff --git a/flang/test/Lower/OpenACC/acc-reduction.f90 b/flang/test/Lower/OpenACC/acc-reduction.f90
index 6cb8bdf..aee28f0 100644
--- a/flang/test/Lower/OpenACC/acc-reduction.f90
+++ b/flang/test/Lower/OpenACC/acc-reduction.f90
@@ -2,757 +2,1212 @@
! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_box_UxUxf32 : !fir.box<!fir.array<?x?xf32>> reduction_operator <max> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>>):
-! CHECK: %[[CST:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0]], %c0{{.*}} : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
-! CHECK: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0]], %c1 : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[DIMS0]]#1, %[[DIMS1]]#1 : (index, index) -> !fir.shape<2>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?x?xf32>, %[[DIMS0]]#1, %[[DIMS1]]#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?x?xf32>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.heap<!fir.array<?x?xf32>>)
-! CHECK: hlfir.assign %[[CST]] to %[[DECL]]#0 : f32, !fir.box<!fir.array<?x?xf32>>
-! CHECK: acc.yield %[[DECL]]#0 : !fir.box<!fir.array<?x?xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[V1:.*]]: !fir.box<!fir.array<?x?xf32>>, %[[V2:.*]]: !fir.box<!fir.array<?x?xf32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}} : (index, index) -> !fir.shape<2>
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[V1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?x?xf32>>, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf32>>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[V2]] shape %[[SHAPE]] : (!fir.box<!fir.array<?x?xf32>>, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf32>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<2>) -> !hlfir.expr<?x?xf32> {
-! CHECK: ^bb0(%[[ARG0:.*]]: index, %[[ARG1:.*]]: index):
-! CHECK: %[[D1:.*]] = hlfir.designate %[[DES_V1]] (%[[ARG0]], %[[ARG1]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[D2:.*]] = hlfir.designate %[[DES_V2]] (%[[ARG0]], %[[ARG1]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[D1]] : !fir.ref<f32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[D2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD1]], %[[LOAD2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : f32
-! CHECK: hlfir.yield_element %[[SELECT]] : f32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[V1]] : !hlfir.expr<?x?xf32>, !fir.box<!fir.array<?x?xf32>>
-! CHECK: acc.yield %[[V1]] : !fir.box<!fir.array<?x?xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_box_ptr_Uxf32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> reduction_operator <max> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>):
-! CHECK: %[[CST:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[BOX:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[BOX]], %[[C0]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS]]#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[STORAGE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
-! CHECK: %[[BOXTEMP:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[BOXTEMP]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
-! CHECK: hlfir.assign %[[CST]] to %[[DECLARE]]#0 : f32, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>):
-! CHECK: %[[BOX0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[BOX0]], %[[C0]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[BOX0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[BOX0]] shape %[[SHAPE]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
-! CHECK: %[[BOX1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[BOX1]] shape %[[SHAPE]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
-! CHECK: ^bb0(%[[IV:.*]]: index):
-! CHECK: %[[V1:.*]] = hlfir.designate %[[DES_V1]] (%[[IV]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[V2:.*]] = hlfir.designate %[[DES_V2]] (%[[IV]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[V1]] : !fir.ref<f32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[V2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD_V1]], %[[LOAD_V2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD_V1]], %[[LOAD_V2]] : f32
-! CHECK: hlfir.yield_element %[[SELECT]] : f32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xf32>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_box_heap_Uxf32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> reduction_operator <max> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>):
-! CHECK: %[[CST:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[BOX:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[BOX]], %[[C0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS]]#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[STORAGE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
-! CHECK: %[[BOXTEMP:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[BOXTEMP]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
-! CHECK: hlfir.assign %[[CST]] to %[[DECLARE]]#0 : f32, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>):
-! CHECK: %[[BOX0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[BOX0]], %[[C0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[BOX0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[BOX0]] shape %[[SHAPE]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
-! CHECK: %[[BOX1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[BOX1]] shape %[[SHAPE]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
-! CHECK: ^bb0(%[[IV:.*]]: index):
-! CHECK: %[[V1:.*]] = hlfir.designate %[[DES_V1]] (%[[IV]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[V2:.*]] = hlfir.designate %[[DES_V2]] (%[[IV]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[V1]] : !fir.ref<f32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[V2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD_V1]], %[[LOAD_V2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD_V1]], %[[LOAD_V2]] : f32
-! CHECK: hlfir.yield_element %[[SELECT]] : f32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xf32>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb1.ub3_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xi32>, %0#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
-! CHECK: hlfir.assign %c0{{.*}} to %[[DECLARE]]#0 : i32, !fir.box<!fir.array<?xi32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.box<!fir.array<?xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[DES1:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[DES2:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
-! CHECK: ^bb0(%[[IV:.*]]: index):
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[DES1]] (%[[IV]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[DES2]] (%[[IV]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[DES_V1]] : !fir.ref<i32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[DES_V2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD_V1]], %[[LOAD_V2]] : i32
-! CHECK: hlfir.yield_element %[[COMBINED]] : i32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xi32>, !fir.box<!fir.array<?xi32>>
-! CHECK: acc.yield %[[ARG0]] : !fir.box<!fir.array<?xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_box_Uxf32 : !fir.box<!fir.array<?xf32>> reduction_operator <max> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xf32>>):
-! CHECK: %[[INIT_VALUE:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[C0]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xf32>, %0#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
-! CHECK: hlfir.assign %[[INIT_VALUE]] to %[[DECLARE]]#0 : f32, !fir.box<!fir.array<?xf32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.box<!fir.array<?xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xf32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xf32>>
-! CHECK: %[[LEFT:.*]] = hlfir.designate %[[ARG0]] shape %{{.*}} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
-! CHECK: %[[RIGHT:.*]] = hlfir.designate %[[ARG1]] shape %{{.*}} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
-! CHECK: ^bb0(%{{.*}}: index):
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[LEFT]] (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[RIGHT]] (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[DES_V1]] : !fir.ref<f32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[DES_V2]] : !fir.ref<f32>
-! CHECK: %[[CMPF:.*]] = arith.cmpf ogt, %[[LOAD_V1]], %[[LOAD_V2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMPF]], %[[LOAD_V1]], %[[LOAD_V2]] : f32
-! CHECK: hlfir.yield_element %[[SELECT]] : f32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xf32>, !fir.box<!fir.array<?xf32>>
-! CHECK: acc.yield %[[ARG0]] : !fir.box<!fir.array<?xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[INIT_VALUE:.*]] = arith.constant 0 : i32
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[C0]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS]]#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
-! CHECK: hlfir.assign %[[INIT_VALUE]] to %[[DECLARE]]#0 : i32, !fir.box<!fir.array<?xi32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.box<!fir.array<?xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[C0]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[LEFT:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[RIGHT:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
-! CHECK: ^bb0(%{{.*}}: index):
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[LEFT]] (%{{.*}}) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[RIGHT]] (%{{.*}}) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[DES_V1]] : !fir.ref<i32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[DES_V2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD_V1]], %[[LOAD_V2]] : i32
-! CHECK: hlfir.yield_element %[[COMBINED]] : i32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xi32>, !fir.box<!fir.array<?xi32>>
-! CHECK: acc.yield %arg0 : !fir.box<!fir.array<?xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb0.ub9xlb0.ub19_ref_10x20xi32 : !fir.ref<!fir.array<10x20xi32>> reduction_operator <add> init {
-! CHECK: fir.do_loop %arg1 = %c0 to %c19 step %c1 {
-! CHECK: fir.do_loop %arg2 = %c0_0 to %c9 step %c1_1 {
-! CHECK: } combiner {
-! CHECK: fir.do_loop %arg2 = %c0 to %c19 step %c1 {
-! CHECK: fir.do_loop %arg3 = %c0_0 to %c9 step %c1_1 {
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_z32 : !fir.ref<complex<f32>> reduction_operator <mul> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<complex<f32>>):
-! CHECK: %[[REAL:.*]] = arith.constant 1.000000e+00 : f32
-! CHECK: %[[IMAG:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[UNDEF:.*]] = fir.undefined complex<f32>
-! CHECK: %[[UNDEF1:.*]] = fir.insert_value %[[UNDEF]], %[[REAL]], [0 : index] : (complex<f32>, f32) -> complex<f32>
-! CHECK: %[[UNDEF2:.*]] = fir.insert_value %[[UNDEF1]], %[[IMAG]], [1 : index] : (complex<f32>, f32) -> complex<f32>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca complex<f32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>)
-! CHECK: fir.store %[[UNDEF2]] to %[[DECLARE]]#0 : !fir.ref<complex<f32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<complex<f32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<complex<f32>>, %[[ARG1:.*]]: !fir.ref<complex<f32>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<complex<f32>>
-! CHECK: %[[COMBINED:.*]] = fir.mulc %[[LOAD0]], %[[LOAD1]] {fastmath = #arith.fastmath<contract>} : complex<f32>
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_z32 : !fir.ref<complex<f32>> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<complex<f32>>):
-! CHECK: %[[REAL:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[IMAG:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[UNDEF:.*]] = fir.undefined complex<f32>
-! CHECK: %[[UNDEF1:.*]] = fir.insert_value %[[UNDEF]], %[[REAL]], [0 : index] : (complex<f32>, f32) -> complex<f32>
-! CHECK: %[[UNDEF2:.*]] = fir.insert_value %[[UNDEF1]], %[[IMAG]], [1 : index] : (complex<f32>, f32) -> complex<f32>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca complex<f32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>)
-! CHECK: fir.store %[[UNDEF2]] to %[[DECLARE]]#0 : !fir.ref<complex<f32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<complex<f32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<complex<f32>>, %[[ARG1:.*]]: !fir.ref<complex<f32>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<complex<f32>>
-! CHECK: %[[COMBINED:.*]] = fir.addc %[[LOAD0]], %[[LOAD1]] {fastmath = #arith.fastmath<contract>} : complex<f32>
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_neqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <neqv> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[CST:.*]] = arith.constant false
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[CONVERT:.*]] = fir.convert %[[CST]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CONVERT]] to %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[CONV0:.*]] = fir.convert %[[LOAD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CONV1:.*]] = fir.convert %[[LOAD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CMP:.*]] = arith.cmpi ne, %[[CONV0]], %[[CONV1]] : i1
-! CHECK: %[[CMP_CONV:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CMP_CONV]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_eqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <eqv> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[CST:.*]] = arith.constant true
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[CONVERT:.*]] = fir.convert %[[CST]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CONVERT]] to %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[CONV0:.*]] = fir.convert %[[LOAD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CONV1:.*]] = fir.convert %[[LOAD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[CONV0]], %[[CONV1]] : i1
-! CHECK: %[[CMP_CONV:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CMP_CONV]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_lor_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <lor> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[CST:.*]] = arith.constant false
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[CONVERT:.*]] = fir.convert %[[CST]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CONVERT]] to %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[CONV0:.*]] = fir.convert %[[LOAD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CONV1:.*]] = fir.convert %[[LOAD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CMP:.*]] = arith.ori %[[CONV0]], %[[CONV1]] : i1
-! CHECK: %[[CMP_CONV:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CMP_CONV]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_land_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <land> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[CST:.*]] = arith.constant true
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[CONVERT:.*]] = fir.convert %[[CST]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CONVERT]] to %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[CONV0:.*]] = fir.convert %[[LOAD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CONV1:.*]] = fir.convert %[[LOAD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CMP:.*]] = arith.andi %[[CONV0]], %[[CONV1]] : i1
-! CHECK: %[[CMP_CONV:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CMP_CONV]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_xor_ref_i32 : !fir.ref<i32> reduction_operator <xor> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[CST:.*]] = arith.constant 0 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[CST]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.xori %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_ior_ref_i32 : !fir.ref<i32> reduction_operator <ior> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[CST:.*]] = arith.constant 0 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[CST]] to %[[DECLARE:.*]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE:.*]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.ori %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_iand_ref_i32 : !fir.ref<i32> reduction_operator <iand> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[CST:.*]] = arith.constant -1 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[CST]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.andi %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <max> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[INIT:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xf32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
-! CHECK: %[[LB:.*]] = arith.constant 0 : index
-! CHECK: %[[UB:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] {
-! CHECK: %[[COORD:.*]] = fir.coordinate_of %[[DECLARE]]#0, %[[IV]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: fir.store %[[INIT]] to %[[COORD]] : !fir.ref<f32>
-! CHECK: }
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV0]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV0]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<f32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD1]], %[[LOAD2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : f32
-! CHECK: fir.store %[[SELECT]] to %[[COORD1]] : !fir.ref<f32>
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_f32 : !fir.ref<f32> reduction_operator <max> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-! CHECK: %[[INIT:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca f32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %0 {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD0]], %[[LOAD1]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD0]], %[[LOAD1]] : f32
-! CHECK: fir.store %[[SELECT]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<f32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <max> init {
-! CHECK: ^bb0(%arg0: !fir.ref<!fir.array<100x10xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant -2147483648 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}} : (index, index) -> !fir.shape<2>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xi32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[LB1:.*]] = arith.constant 0 : index
-! CHECK: %[[UB1:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP1:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0:.*]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1:.*]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[SELECT]] to %[[COORD1]] : !fir.ref<i32>
-! CHECK: }
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_i32 : !fir.ref<i32> reduction_operator <max> init {
-! CHECK: ^bb0(%arg0: !fir.ref<i32>):
-! CHECK: %[[INIT:.*]] = arith.constant -2147483648 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[SELECT]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100x10xf32 : !fir.ref<!fir.array<100x10xf32>> reduction_operator <min> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100x10xf32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 3.40282347E+38 : f32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}} : (index, index) -> !fir.shape<2>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10xf32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xf32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xf32>>, !fir.ref<!fir.array<100x10xf32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xf32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[LB1:.*]] = arith.constant 0 : index
-! CHECK: %[[UB1:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP1:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<f32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf olt, %[[LOAD1]], %[[LOAD2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : f32
-! CHECK: fir.store %[[SELECT]] to %[[COORD1]] : !fir.ref<f32>
-! CHECK: }
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_f32 : !fir.ref<f32> reduction_operator <min> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-! CHECK: %[[INIT:.*]] = arith.constant 3.40282347E+38 : f32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca f32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf olt, %[[LOAD0]], %[[LOAD1]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD0]], %[[LOAD1]] : f32
-! CHECK: fir.store %[[SELECT]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<f32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <min> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 2147483647 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV0]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV0]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[CMP:.*]] = arith.cmpi slt, %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[SELECT]] to %[[COORD1]] : !fir.ref<i32>
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_i32 : !fir.ref<i32> reduction_operator <min> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[INIT:.*]] = arith.constant 2147483647 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[CMP:.*]] = arith.cmpi slt, %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[SELECT]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_f32 : !fir.ref<f32> reduction_operator <mul> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-! CHECK: %[[INIT:.*]] = arith.constant 1.000000e+00 : f32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca f32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK: %[[COMBINED:.*]] = arith.mulf %[[LOAD0]], %[[LOAD1]] fastmath<contract> : f32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<f32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <mul> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 1 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[LB:.*]] = arith.constant 0 : index
-! CHECK: %[[UB:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.muli %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<i32>
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_i32 : !fir.ref<i32> reduction_operator <mul> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[INIT:.*]] = arith.constant 1 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.muli %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xf32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[LB:.*]] = arith.constant 0 : index
-! CHECK: %[[UB:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<f32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<f32>
-! CHECK: %[[COMBINED:.*]] = arith.addf %[[LOAD1]], %[[LOAD2]] fastmath<contract> : f32
-! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<f32>
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_f32 : !fir.ref<f32> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-! CHECK: %[[INIT:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca f32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK: %[[COMBINED:.*]] = arith.addf %[[LOAD0]], %[[LOAD1]] fastmath<contract> : f32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<f32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10x2xi32 : !fir.ref<!fir.array<100x10x2xi32>> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100x10x2xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 0 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}}, %{{.*}} : (index, index, index) -> !fir.shape<3>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10x2xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10x2xi32>>, !fir.shape<3>) -> (!fir.ref<!fir.array<100x10x2xi32>>, !fir.ref<!fir.array<100x10x2xi32>>)
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 1 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[LB1:.*]] = arith.constant 0 : index
-! CHECK: %[[UB1:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP1:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
-! CHECK: %[[LB2:.*]] = arith.constant 0 : index
-! CHECK: %[[UB2:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP2:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV2:.*]] = %[[LB2]] to %[[UB2]] step %[[STEP2]] {
-! CHECK: %[[COORD]] = fir.coordinate_of %[[DECLARE]]#0, %[[IV2]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
-! CHECK: fir.store %[[INIT]] to %[[COORD]] : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10x2xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10x2xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10x2xi32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 1 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[LB1:.*]] = arith.constant 0 : index
-! CHECK: %[[UB1:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP1:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
-! CHECK: %[[LB2:.*]] = arith.constant 0 : index
-! CHECK: %[[UB2:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP2:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV2:.*]] = %[[LB2]] to %[[UB2]] step %[[STEP2]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV2]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV2]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<i32>
-! CHECK: }
-! CHECK: }
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10x2xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100x10xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 0 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}} : (index, index) -> !fir.shape<2>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xi32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[LB1:.*]] = arith.constant 0 : index
-! CHECK: %[[UB1:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP1:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<i32>
-! CHECK: }
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 0 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[LB:.*]] = arith.constant 0 : index
-! CHECK: %[[UB:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<i32>
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[INIT:.*]] = arith.constant 0 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
+! CHECK-LABEL: acc.reduction.recipe @reduction_max_box_UxUxf32 : !fir.box<!fir.array<?x?xf32>> reduction_operator <max> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_2]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1, %[[BOX_DIMS_1]]#1 : (index, index) -> !fir.shape<2>
+! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?x?xf32>, %[[BOX_DIMS_0]]#1, %[[BOX_DIMS_1]]#1 {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?x?xf32>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.heap<!fir.array<?x?xf32>>)
+! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : f32, !fir.box<!fir.array<?x?xf32>>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.box<!fir.array<?x?xf32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_0]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 1 : index
+! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1, %[[BOX_DIMS_1]]#1 : (index, index) -> !fir.shape<2>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_2]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : index
+! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_3]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_2]]#1, %[[BOX_DIMS_3]]#1 : (index, index) -> !fir.shape<2>
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[BOX_DIMS_1]]#1 step %[[CONSTANT_4]] unordered {
+! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_4]] unordered {
+! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_4:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_5]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
+! CHECK: %[[BOX_DIMS_5:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_6]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_7:.*]] = arith.constant 1 : index
+! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_4]]#0, %[[CONSTANT_7]] : index
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_3]], %[[SUBI_0]] : index
+! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_5]]#0, %[[CONSTANT_7]] : index
+! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]], %[[ADDI_1]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
+! CHECK: %[[CONSTANT_8:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_8]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_9:.*]] = arith.constant 1 : index
+! CHECK: %[[BOX_DIMS_7:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_9]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_10:.*]] = arith.constant 1 : index
+! CHECK: %[[SUBI_2:.*]] = arith.subi %[[BOX_DIMS_6]]#0, %[[CONSTANT_10]] : index
+! CHECK: %[[ADDI_2:.*]] = arith.addi %[[VAL_3]], %[[SUBI_2]] : index
+! CHECK: %[[SUBI_3:.*]] = arith.subi %[[BOX_DIMS_7]]#0, %[[CONSTANT_10]] : index
+! CHECK: %[[ADDI_3:.*]] = arith.addi %[[VAL_2]], %[[SUBI_3]] : index
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_2]], %[[ADDI_3]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
+! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
+! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32>
+! CHECK: }
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.box<!fir.array<?x?xf32>>
+
+! CHECK-LABEL: } destroy {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>>):
+! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ref<!fir.array<?x?xf32>>) -> !fir.heap<!fir.array<?x?xf32>>
+! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?x?xf32>>
+! CHECK: acc.terminator
+! CHECK: }
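
A rough Fortran sketch (hypothetical names, not part of this change) of the kind of source that exercises a boxed two-dimensional max recipe like the one above: the init region allocates a temporary sized from the incoming box's dimensions and fills it with the max identity, the combiner performs an element-wise max with lower-bound adjustment, and the destroy region frees the temporary.

  subroutine red_max2d(a, b, n)
    real :: a(:,:), b(:,:)   ! assumed-shape: lowered as !fir.box<!fir.array<?x?xf32>>
    integer :: n, i
    !$acc parallel loop reduction(max:a)
    do i = 1, n
      a = max(a, b)          ! a is the reduction variable
    end do
  end subroutine
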
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_box_ptr_Uxf32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> reduction_operator <max> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_1]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>>
+! CHECK: %[[DECLARE_1:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[DECLARE_1]]#0 : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.array<?xf32>>>
+! CHECK: fir.store %[[DECLARE_0]]#0 to %[[CONVERT_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_1]]#0 : f32, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+! CHECK: acc.yield %[[DECLARE_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_0]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_1]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_1]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_2]] unordered {
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_3]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[LOAD_0]] (%[[ADDI_0]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
+! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_5]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
+! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index
+! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[LOAD_1]] (%[[ADDI_1]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_3:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
+! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_3]], %[[LOAD_2]] fastmath<contract> : f32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_3]], %[[LOAD_2]] : f32
+! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32>
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+
+! CHECK-LABEL: } destroy {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.ptr<!fir.array<?xf32>>
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ptr<!fir.array<?xf32>>) -> !fir.heap<!fir.array<?xf32>>
+! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?xf32>>
+! CHECK: acc.terminator
+! CHECK: }
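
Likewise for the pointer case (a sketch with hypothetical names): the reduction variable below is a one-dimensional pointer array, which is why the recipe above traffics in !fir.ref<!fir.box<!fir.ptr<...>>> and loads the descriptor before indexing.

  subroutine red_max_ptr(p, b, n)
    real, pointer :: p(:)    ! descriptor passed by reference in FIR
    real :: b(:)
    integer :: n, i
    !$acc parallel loop reduction(max:p)
    do i = 1, n
      p = max(p, b)
    end do
  end subroutine
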
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_box_heap_Uxf32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> reduction_operator <max> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_1]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>>
+! CHECK: %[[DECLARE_1:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[DECLARE_1]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.array<?xf32>>>
+! CHECK: fir.store %[[DECLARE_0]]#0 to %[[CONVERT_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_1]]#0 : f32, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: acc.yield %[[DECLARE_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_1]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_1]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_2]] unordered {
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_3]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[LOAD_0]] (%[[ADDI_0]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
+! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_5]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
+! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index
+! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[LOAD_1]] (%[[ADDI_1]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_3:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
+! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_3]], %[[LOAD_2]] fastmath<contract> : f32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_3]], %[[LOAD_2]] : f32
+! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32>
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+
+! CHECK-LABEL: } destroy {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
+! CHECK: fir.freemem %[[BOX_ADDR_0]] : !fir.heap<!fir.array<?xf32>>
+! CHECK: acc.terminator
+! CHECK: }
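
The allocatable variant differs only in the descriptor's address space (!fir.heap instead of !fir.ptr), so its destroy region can free the box address directly without a fir.convert. A matching sketch (hypothetical names):

  subroutine red_max_alloc(a, b, n)
    real, allocatable :: a(:)
    real :: b(:)
    integer :: n, i
    !$acc parallel loop reduction(max:a)
    do i = 1, n
      a = max(a, b)
    end do
  end subroutine
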
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb1.ub3_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
+! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : i32, !fir.box<!fir.array<?xi32>>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.box<!fir.array<?xi32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 3 : index
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index
+! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_1]], %[[CONSTANT_0]] : index
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[SUBI_0]], %[[CONSTANT_2]] : index
+! CHECK: %[[DIVSI_0:.*]] = arith.divsi %[[ADDI_0]], %[[CONSTANT_2]] : index
+! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[DIVSI_0]], %[[CONSTANT_3]] : index
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[DIVSI_0]], %[[CONSTANT_3]] : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[SELECT_0]] : (index) -> !fir.shape<1>
+! CHECK: %[[BD_LHS:.*]]:3 = fir.box_dims %[[VAL_0]], %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! CHECK: %[[LB_LHS:.*]] = arith.addi %[[BD_LHS]]#0, %c1{{.*}} : index
+! CHECK: %[[UB_LHS:.*]] = arith.addi %[[BD_LHS]]#0, %c3{{.*}} : index
+! CHECK: %[[BD_RHS:.*]]:3 = fir.box_dims %[[VAL_1]], %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! CHECK: %[[LB_RHS:.*]] = arith.addi %[[BD_RHS]]#0, %c1{{.*}} : index
+! CHECK: %[[UB_RHS:.*]] = arith.addi %[[BD_RHS]]#0, %c3{{.*}} : index
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[LB_RHS]]:%[[UB_RHS]]:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[LB_LHS]]:%[[UB_LHS]]:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[SELECT_0]] step %[[CONSTANT_4]] unordered {
+! CHECK: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_2]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
+! CHECK: %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_2]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32>
+! CHECK: %[[ADDI_1:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[ADDI_1]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32>
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.box<!fir.array<?xi32>>
+
+! CHECK-LABEL: } destroy {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>):
+! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
+! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?xi32>>
+! CHECK: acc.terminator
+! CHECK: }
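
The "section_lb1.ub3" recipe name encodes zero-based bounds, and the combiner above offsets them by the box's runtime lower bound; in one-based Fortran that corresponds to reducing over a section such as a(2:4). A sketch (hypothetical names):

  subroutine red_add_section(a, n)
    integer :: a(:)
    integer :: n, i
    !$acc parallel loop reduction(+:a(2:4))
    do i = 1, n
      a(2:4) = a(2:4) + i
    end do
  end subroutine
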
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_max_box_Uxf32 : !fir.box<!fir.array<?xf32>> reduction_operator <max> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
+! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : f32, !fir.box<!fir.array<?xf32>>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.box<!fir.array<?xf32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_0]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_1]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_2]] unordered {
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_3]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
+! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_5]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
+! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index
+! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_1]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
+! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
+! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32>
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.box<!fir.array<?xf32>>
+
+! CHECK-LABEL: } destroy {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>>):
+! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.heap<!fir.array<?xf32>>
+! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?xf32>>
+! CHECK: acc.terminator
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
+! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : i32, !fir.box<!fir.array<?xi32>>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.box<!fir.array<?xi32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_0]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_1]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_2]] unordered {
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_3]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
+! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index
+! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
+! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index
+! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_1]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
+! CHECK: %[[ADDI_2:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[ADDI_2]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.box<!fir.array<?xi32>>
+
+! CHECK-LABEL: } destroy {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>):
+! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
+! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?xi32>>
+! CHECK: acc.terminator
+! CHECK: }
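
The whole-array integer counterpart has the same init/combiner/destroy shape, with arith.addi in place of the cmpf/select pair. A sketch (hypothetical names):

  subroutine red_add_1d(a, n)
    integer :: a(:)
    integer :: n, i
    !$acc parallel loop reduction(+:a)
    do i = 1, n
      a = a + 1
    end do
  end subroutine
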
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb0.ub9xlb0.ub19_ref_10x20xi32 : !fir.ref<!fir.array<10x20xi32>> reduction_operator <add> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<10x20xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 20 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<10x20xi32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>)
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 19 : index
+! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_3]] to %[[CONSTANT_4]] step %[[CONSTANT_5]] {
+! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_7:.*]] = arith.constant 9 : index
+! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_7]] step %[[CONSTANT_8]] {
+! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.ref<i32>
+! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32>
+! CHECK: }
+! CHECK: }
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<10x20xi32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<10x20xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<10x20xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 9 : index
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 19 : index
+! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index
+! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index
+! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_1]], %[[CONSTANT_0]] : index
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[SUBI_0]], %[[CONSTANT_2]] : index
+! CHECK: %[[DIVSI_0:.*]] = arith.divsi %[[ADDI_0]], %[[CONSTANT_2]] : index
+! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[DIVSI_0]], %[[CONSTANT_6]] : index
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[DIVSI_0]], %[[CONSTANT_6]] : index
+! CHECK: %[[CONSTANT_7:.*]] = arith.constant 0 : index
+! CHECK: %[[SUBI_1:.*]] = arith.subi %[[CONSTANT_4]], %[[CONSTANT_3]] : index
+! CHECK: %[[ADDI_1:.*]] = arith.addi %[[SUBI_1]], %[[CONSTANT_5]] : index
+! CHECK: %[[DIVSI_1:.*]] = arith.divsi %[[ADDI_1]], %[[CONSTANT_5]] : index
+! CHECK: %[[CMPI_1:.*]] = arith.cmpi sgt, %[[DIVSI_1]], %[[CONSTANT_7]] : index
+! CHECK: %[[SELECT_1:.*]] = arith.select %[[CMPI_1]], %[[DIVSI_1]], %[[CONSTANT_7]] : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[SELECT_0]], %[[SELECT_1]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%c1{{.*}}:%c10{{.*}}:%c1{{.*}}, %c1{{.*}}:%c20{{.*}}:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.ref<!fir.array<10x20xi32>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.ref<!fir.array<10x20xi32>>
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%c1{{.*}}:%c10{{.*}}:%c1{{.*}}, %c1{{.*}}:%c20{{.*}}:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.ref<!fir.array<10x20xi32>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.ref<!fir.array<10x20xi32>>
+! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_8]] to %[[SELECT_1]] step %[[CONSTANT_8]] unordered {
+! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_8]] to %[[SELECT_0]] step %[[CONSTANT_8]] unordered {
+! CHECK: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
+! CHECK: %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32>
+! CHECK: %[[ADDI_2:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[ADDI_2]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32>
+! CHECK: }
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<10x20xi32>>
+! CHECK: }
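
For a statically sized array the section bounds are compile-time constants, so the combiner above computes each extent as max(0, (ub - lb + step) / step) directly from literals; "lb0.ub9xlb0.ub19" on a 10x20 array is the full-range section a(1:10,1:20). A sketch (hypothetical names):

  subroutine red_add_2dsec(n)
    integer :: a(10,20)
    integer :: n, i
    a = 0
    !$acc parallel loop reduction(+:a(1:10,1:20))
    do i = 1, n
      a(1:10,1:20) = a(1:10,1:20) + i
    end do
  end subroutine
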
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb10.ub19_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xi32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] {
+! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32>
+! CHECK: }
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xi32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 10 : index
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 19 : index
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index
+! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_1]], %[[CONSTANT_0]] : index
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[SUBI_0]], %[[CONSTANT_2]] : index
+! CHECK: %[[DIVSI_0:.*]] = arith.divsi %[[ADDI_0]], %[[CONSTANT_2]] : index
+! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[DIVSI_0]], %[[CONSTANT_3]] : index
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[DIVSI_0]], %[[CONSTANT_3]] : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[SELECT_0]] : (index) -> !fir.shape<1>
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%c11{{.*}}:%c20{{.*}}:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.ref<!fir.array<100xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<100xi32>>
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%c11{{.*}}:%c20{{.*}}:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.ref<!fir.array<100xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<100xi32>>
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[SELECT_0]] step %[[CONSTANT_4]] unordered {
+! CHECK: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
+! CHECK: %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32>
+! CHECK: %[[ADDI_1:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[ADDI_1]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32>
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xi32>>
+! CHECK: }
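
The one-dimensional case uses the same extent computation and hlfir.designate pair; zero-based lb10.ub19 on a 100-element array is a(11:20) in source terms, matching the %c11:%c20 bounds in the designates above. A sketch (hypothetical names):

  subroutine red_add_sec100(n)
    integer :: a(100)
    integer :: n, i
    a = 0
    !$acc parallel loop reduction(+:a(11:20))
    do i = 1, n
      a(11:20) = a(11:20) + i
    end do
  end subroutine
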
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_box_ptr_i32 : !fir.ref<!fir.box<!fir.ptr<i32>>> reduction_operator <add> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.box<!fir.ptr<i32>>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem i32
+! CHECK: %[[EMBOX_0:.*]] = fir.embox %[[ALLOCMEM_0]] : (!fir.heap<i32>) -> !fir.box<!fir.ptr<i32>>
+! CHECK: fir.store %[[EMBOX_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
+! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : i32, !fir.ref<!fir.box<!fir.ptr<i32>>>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
+! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
+! CHECK: %[[BOX_ADDR_1:.*]] = fir.box_addr %[[LOAD_1]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
+! CHECK: %[[LOAD_2:.*]] = fir.load %[[BOX_ADDR_0]] : !fir.ptr<i32>
+! CHECK: %[[LOAD_3:.*]] = fir.load %[[BOX_ADDR_1]] : !fir.ptr<i32>
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_3]], %[[LOAD_2]] : i32
+! CHECK: hlfir.assign %[[ADDI_0]] to %[[BOX_ADDR_1]] : i32, !fir.ptr<i32>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
+
+! CHECK-LABEL: } destroy {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
+! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ptr<i32>) -> !fir.heap<i32>
+! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<i32>
+! CHECK: acc.terminator
+! CHECK: }
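
Scalar pointer reductions keep the descriptor indirection but need no loops: the combiner loads both box addresses and adds through them. A sketch (hypothetical names):

  subroutine red_add_ptr(s, n)
    integer, pointer :: s
    integer :: n, i
    !$acc parallel loop reduction(+:s)
    do i = 1, n
      s = s + i
    end do
  end subroutine
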
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> reduction_operator <add> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.box<!fir.heap<i32>>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
+! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem i32
+! CHECK: %[[EMBOX_0:.*]] = fir.embox %[[ALLOCMEM_0]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
+! CHECK: fir.store %[[EMBOX_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : i32, !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: %[[BOX_ADDR_1:.*]] = fir.box_addr %[[LOAD_1]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
+! CHECK: %[[LOAD_2:.*]] = fir.load %[[BOX_ADDR_0]] : !fir.heap<i32>
+! CHECK: %[[LOAD_3:.*]] = fir.load %[[BOX_ADDR_1]] : !fir.heap<i32>
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_3]], %[[LOAD_2]] : i32
+! CHECK: hlfir.assign %[[ADDI_0]] to %[[BOX_ADDR_1]] : i32, !fir.heap<i32>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+
+! CHECK-LABEL: } destroy {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
+! CHECK: fir.freemem %[[BOX_ADDR_0]] : !fir.heap<i32>
+! CHECK: acc.terminator
+! CHECK: }
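
The allocatable scalar is the same pattern with !fir.heap, again letting destroy free the box address without a conversion. A sketch (hypothetical names):

  subroutine red_add_alloc(s, n)
    integer, allocatable :: s
    integer :: n, i
    !$acc parallel loop reduction(+:s)
    do i = 1, n
      s = s + i
    end do
  end subroutine
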
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_z32 : !fir.ref<complex<f32>> reduction_operator <mul> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<complex<f32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK: %[[UNDEFINED_0:.*]] = fir.undefined complex<f32>
+! CHECK: %[[INSERT_VALUE_0:.*]] = fir.insert_value %[[UNDEFINED_0]], %[[CONSTANT_0]], [0 : index] : (complex<f32>, f32) -> complex<f32>
+! CHECK: %[[INSERT_VALUE_1:.*]] = fir.insert_value %[[INSERT_VALUE_0]], %[[CONSTANT_1]], [1 : index] : (complex<f32>, f32) -> complex<f32>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca complex<f32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>)
+! CHECK: fir.store %[[INSERT_VALUE_1]] to %[[DECLARE_0]]#0 : !fir.ref<complex<f32>>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<complex<f32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<complex<f32>>, %[[VAL_1:.*]]: !fir.ref<complex<f32>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<complex<f32>>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<complex<f32>>
+! CHECK: %[[MULC_0:.*]] = fir.mulc %[[LOAD_1]], %[[LOAD_0]] {fastmath = #arith.fastmath<contract>} : complex<f32>
+! CHECK: hlfir.assign %[[MULC_0]] to %[[VAL_0]] : complex<f32>, !fir.ref<complex<f32>>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<complex<f32>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_z32 : !fir.ref<complex<f32>> reduction_operator <add> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<complex<f32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK: %[[UNDEFINED_0:.*]] = fir.undefined complex<f32>
+! CHECK: %[[INSERT_VALUE_0:.*]] = fir.insert_value %[[UNDEFINED_0]], %[[CONSTANT_0]], [0 : index] : (complex<f32>, f32) -> complex<f32>
+! CHECK: %[[INSERT_VALUE_1:.*]] = fir.insert_value %[[INSERT_VALUE_0]], %[[CONSTANT_1]], [1 : index] : (complex<f32>, f32) -> complex<f32>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca complex<f32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>)
+! CHECK: fir.store %[[INSERT_VALUE_1]] to %[[DECLARE_0]]#0 : !fir.ref<complex<f32>>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<complex<f32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<complex<f32>>, %[[VAL_1:.*]]: !fir.ref<complex<f32>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<complex<f32>>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<complex<f32>>
+! CHECK: %[[ADDC_0:.*]] = fir.addc %[[LOAD_1]], %[[LOAD_0]] {fastmath = #arith.fastmath<contract>} : complex<f32>
+! CHECK: hlfir.assign %[[ADDC_0]] to %[[VAL_0]] : complex<f32>, !fir.ref<complex<f32>>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<complex<f32>>
+! CHECK: }
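
The two complex recipes above initialize to the operator identities, (1.0,0.0) for multiplication and (0.0,0.0) for addition, built via fir.undefined/fir.insert_value, and combine with fir.mulc/fir.addc. A sketch exercising both (hypothetical names):

  subroutine red_complex(c1, c2, n)
    complex :: c1, c2
    integer :: n, i
    !$acc parallel loop reduction(*:c1) reduction(+:c2)
    do i = 1, n
      c1 = c1 * (1.0, 0.0)
      c2 = c2 + (1.0, 0.0)
    end do
  end subroutine
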
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_neqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <neqv> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant false
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.logical<4>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[CONSTANT_0]] : (i1) -> !fir.logical<4>
+! CHECK: fir.store %[[CONVERT_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>, %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.logical<4>>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.logical<4>>
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[LOAD_1]] : (!fir.logical<4>) -> i1
+! CHECK: %[[CONVERT_1:.*]] = fir.convert %[[LOAD_0]] : (!fir.logical<4>) -> i1
+! CHECK: %[[CMPI_0:.*]] = arith.cmpi ne, %[[CONVERT_0]], %[[CONVERT_1]] : i1
+! CHECK: %[[CONVERT_2:.*]] = fir.convert %[[CMPI_0]] : (i1) -> !fir.logical<4>
+! CHECK: hlfir.assign %[[CONVERT_2]] to %[[VAL_0]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.logical<4>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_eqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <eqv> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant true
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.logical<4>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[CONSTANT_0]] : (i1) -> !fir.logical<4>
+! CHECK: fir.store %[[CONVERT_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>, %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.logical<4>>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.logical<4>>
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[LOAD_1]] : (!fir.logical<4>) -> i1
+! CHECK: %[[CONVERT_1:.*]] = fir.convert %[[LOAD_0]] : (!fir.logical<4>) -> i1
+! CHECK: %[[CMPI_0:.*]] = arith.cmpi eq, %[[CONVERT_0]], %[[CONVERT_1]] : i1
+! CHECK: %[[CONVERT_2:.*]] = fir.convert %[[CMPI_0]] : (i1) -> !fir.logical<4>
+! CHECK: hlfir.assign %[[CONVERT_2]] to %[[VAL_0]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.logical<4>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_lor_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <lor> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant false
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.logical<4>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[CONSTANT_0]] : (i1) -> !fir.logical<4>
+! CHECK: fir.store %[[CONVERT_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>, %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.logical<4>>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.logical<4>>
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[LOAD_1]] : (!fir.logical<4>) -> i1
+! CHECK: %[[CONVERT_1:.*]] = fir.convert %[[LOAD_0]] : (!fir.logical<4>) -> i1
+! CHECK: %[[ORI_0:.*]] = arith.ori %[[CONVERT_0]], %[[CONVERT_1]] : i1
+! CHECK: %[[CONVERT_2:.*]] = fir.convert %[[ORI_0]] : (i1) -> !fir.logical<4>
+! CHECK: hlfir.assign %[[CONVERT_2]] to %[[VAL_0]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.logical<4>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_land_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <land> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant true
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.logical<4>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[CONSTANT_0]] : (i1) -> !fir.logical<4>
+! CHECK: fir.store %[[CONVERT_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>, %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.logical<4>>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.logical<4>>
+! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[LOAD_1]] : (!fir.logical<4>) -> i1
+! CHECK: %[[CONVERT_1:.*]] = fir.convert %[[LOAD_0]] : (!fir.logical<4>) -> i1
+! CHECK: %[[ANDI_0:.*]] = arith.andi %[[CONVERT_0]], %[[CONVERT_1]] : i1
+! CHECK: %[[CONVERT_2:.*]] = fir.convert %[[ANDI_0]] : (i1) -> !fir.logical<4>
+! CHECK: hlfir.assign %[[CONVERT_2]] to %[[VAL_0]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.logical<4>>
+! CHECK: }
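
The four logical recipes above follow one template: initialize to the operator's identity (false for .neqv. and .or., true for .eqv. and .and.), convert !fir.logical<4> to i1, apply arith.cmpi ne/eq, arith.ori, or arith.andi, and convert back. A combined sketch (hypothetical names):

  subroutine red_logical(l1, l2, l3, l4, v, n)
    logical :: l1, l2, l3, l4
    logical :: v(:)
    integer :: n, i
    !$acc parallel loop reduction(.neqv.:l1) reduction(.eqv.:l2) &
    !$acc   reduction(.or.:l3) reduction(.and.:l4)
    do i = 1, n
      l1 = l1 .neqv. v(i)
      l2 = l2 .eqv. v(i)
      l3 = l3 .or. v(i)
      l4 = l4 .and. v(i)
    end do
  end subroutine
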
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_xor_ref_i32 : !fir.ref<i32> reduction_operator <xor> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
+! CHECK: %[[XORI_0:.*]] = arith.xori %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[XORI_0]] to %[[VAL_0]] : i32, !fir.ref<i32>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_ior_ref_i32 : !fir.ref<i32> reduction_operator <ior> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
+! CHECK: %[[ORI_0:.*]] = arith.ori %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[ORI_0]] to %[[VAL_0]] : i32, !fir.ref<i32>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_iand_ref_i32 : !fir.ref<i32> reduction_operator <iand> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1 : i32
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
+! CHECK: %[[ANDI_0:.*]] = arith.andi %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[ANDI_0]] to %[[VAL_0]] : i32, !fir.ref<i32>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32>
+! CHECK: }
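
The bitwise integer recipes are scalar analogues with identities 0 for ieor/ior and -1 (all bits set) for iand, combining through arith.xori/ori/andi. A combined sketch (hypothetical names):

  subroutine red_bits(x, y, z, v, n)
    integer :: x, y, z
    integer :: v(:)
    integer :: n, i
    !$acc parallel loop reduction(ieor:x) reduction(ior:y) reduction(iand:z)
    do i = 1, n
      x = ieor(x, v(i))
      y = ior(y, v(i))
      z = iand(z, v(i))
    end do
  end subroutine
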
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <max> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xf32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] {
+! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<f32>
+! CHECK: }
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xf32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xf32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
+! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
+! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32>
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xf32>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_f32 : !fir.ref<f32> reduction_operator <max> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca f32
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<f32>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<f32>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>, %[[VAL_1:.*]]: !fir.ref<f32>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<f32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32>
+! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
+! CHECK: hlfir.assign %[[SELECT_0]] to %[[VAL_0]] : f32, !fir.ref<f32>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<f32>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <max> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant -2147483648 : i32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 10 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100x10xi32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>)
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 9 : index
+! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_3]] to %[[CONSTANT_4]] step %[[CONSTANT_5]] {
+! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_7:.*]] = arith.constant 99 : index
+! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_7]] step %[[CONSTANT_8]] {
+! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
+! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32>
+! CHECK: }
+! CHECK: }
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100x10xi32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100x10xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]], %[[CONSTANT_1]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 100 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 10 : index
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_2]], %[[CONSTANT_3]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered {
+! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered {
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
+! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
+! CHECK: }
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100x10xi32>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_i32 : !fir.ref<i32> reduction_operator <max> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant -2147483648 : i32
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
+! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[SELECT_0]] to %[[VAL_0]] : i32, !fir.ref<i32>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100x10xf32 : !fir.ref<!fir.array<100x10xf32>> reduction_operator <min> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xf32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 3.40282347E+38 : f32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 10 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100x10xf32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xf32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xf32>>, !fir.ref<!fir.array<100x10xf32>>)
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 9 : index
+! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_3]] to %[[CONSTANT_4]] step %[[CONSTANT_5]] {
+! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_7:.*]] = arith.constant 99 : index
+! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_7]] step %[[CONSTANT_8]] {
+! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
+! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<f32>
+! CHECK: }
+! CHECK: }
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100x10xf32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100x10xf32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]], %[[CONSTANT_1]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 100 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 10 : index
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_2]], %[[CONSTANT_3]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered {
+! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered {
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
+! CHECK: %[[CMPF_0:.*]] = arith.cmpf olt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
+! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32>
+! CHECK: }
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100x10xf32>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_f32 : !fir.ref<f32> reduction_operator <min> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 3.40282347E+38 : f32
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca f32
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<f32>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<f32>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>, %[[VAL_1:.*]]: !fir.ref<f32>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<f32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32>
+! CHECK: %[[CMPF_0:.*]] = arith.cmpf olt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
+! CHECK: hlfir.assign %[[SELECT_0]] to %[[VAL_0]] : f32, !fir.ref<f32>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<f32>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <min> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 2147483647 : i32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xi32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] {
+! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32>
+! CHECK: }
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xi32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
+! CHECK: %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xi32>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_i32 : !fir.ref<i32> reduction_operator <min> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 2147483647 : i32
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
+! CHECK: %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[SELECT_0]] to %[[VAL_0]] : i32, !fir.ref<i32>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <mul> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xf32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] {
+! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<f32>
+! CHECK: }
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xf32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xf32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
+! CHECK: %[[MULF_0:.*]] = arith.mulf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
+! CHECK: hlfir.assign %[[MULF_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32>
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xf32>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_f32 : !fir.ref<f32> reduction_operator <mul> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca f32
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<f32>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<f32>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>, %[[VAL_1:.*]]: !fir.ref<f32>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<f32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32>
+! CHECK: %[[MULF_0:.*]] = arith.mulf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
+! CHECK: hlfir.assign %[[MULF_0]] to %[[VAL_0]] : f32, !fir.ref<f32>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<f32>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <mul> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : i32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xi32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] {
+! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32>
+! CHECK: }
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xi32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
+! CHECK: %[[MULI_0:.*]] = arith.muli %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[MULI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xi32>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_i32 : !fir.ref<i32> reduction_operator <mul> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : i32
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
+! CHECK: %[[MULI_0:.*]] = arith.muli %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[MULI_0]] to %[[VAL_0]] : i32, !fir.ref<i32>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <add> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xf32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] {
+! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<f32>
+! CHECK: }
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xf32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xf32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
+! CHECK: %[[ADDF_0:.*]] = arith.addf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
+! CHECK: hlfir.assign %[[ADDF_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32>
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xf32>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_f32 : !fir.ref<f32> reduction_operator <add> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca f32
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<f32>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<f32>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>, %[[VAL_1:.*]]: !fir.ref<f32>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<f32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32>
+! CHECK: %[[ADDF_0:.*]] = arith.addf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
+! CHECK: hlfir.assign %[[ADDF_0]] to %[[VAL_0]] : f32, !fir.ref<f32>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<f32>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10x2xi32 : !fir.ref<!fir.array<100x10x2xi32>> reduction_operator <add> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10x2xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 10 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 2 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]], %[[CONSTANT_3]] : (index, index, index) -> !fir.shape<3>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100x10x2xi32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10x2xi32>>, !fir.shape<3>) -> (!fir.ref<!fir.array<100x10x2xi32>>, !fir.ref<!fir.array<100x10x2xi32>>)
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index
+! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_5]] step %[[CONSTANT_6]] {
+! CHECK: %[[CONSTANT_7:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_8:.*]] = arith.constant 9 : index
+! CHECK: %[[CONSTANT_9:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_7]] to %[[CONSTANT_8]] step %[[CONSTANT_9]] {
+! CHECK: %[[CONSTANT_10:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_11:.*]] = arith.constant 99 : index
+! CHECK: %[[CONSTANT_12:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_10]] to %[[CONSTANT_11]] step %[[CONSTANT_12]] {
+! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_3]], %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
+! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32>
+! CHECK: }
+! CHECK: }
+! CHECK: }
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100x10x2xi32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10x2xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100x10x2xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 2 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]], %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index, index) -> !fir.shape<3>
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 100 : index
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 10 : index
+! CHECK: %[[CONSTANT_5:.*]] = arith.constant 2 : index
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_3]], %[[CONSTANT_4]], %[[CONSTANT_5]] : (index, index, index) -> !fir.shape<3>
+! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_2]] step %[[CONSTANT_6]] unordered {
+! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_1]] step %[[CONSTANT_6]] unordered {
+! CHECK: fir.do_loop %[[VAL_4:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_0]] step %[[CONSTANT_6]] unordered {
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_4]], %[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_4]], %[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
+! CHECK: }
+! CHECK: }
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100x10x2xi32>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <add> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 10 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100x10xi32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>)
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 9 : index
+! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_3]] to %[[CONSTANT_4]] step %[[CONSTANT_5]] {
+! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_7:.*]] = arith.constant 99 : index
+! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_7]] step %[[CONSTANT_8]] {
+! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
+! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32>
+! CHECK: }
+! CHECK: }
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100x10xi32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100x10xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]], %[[CONSTANT_1]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 100 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 10 : index
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_2]], %[[CONSTANT_3]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered {
+! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered {
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
+! CHECK: }
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100x10xi32>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xi32>
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] {
+! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32>
+! CHECK: }
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xi32>>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xi32>>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
+! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
+! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
+! CHECK: }
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xi32>>
+! CHECK: }
+
+! CHECK-LABEL: acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.private.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32>
+! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32>
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>):
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: hlfir.assign %[[ADDI_0]] to %[[VAL_0]] : i32, !fir.ref<i32>
+! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32>
+! CHECK: }
subroutine acc_reduction_add_int(a, b)
integer :: a(100)
diff --git a/flang/test/Transforms/OpenACC/acc-implicit-copy-reduction.fir b/flang/test/Transforms/OpenACC/acc-implicit-copy-reduction.fir
new file mode 100644
index 0000000..d0fc5b7
--- /dev/null
+++ b/flang/test/Transforms/OpenACC/acc-implicit-copy-reduction.fir
@@ -0,0 +1,134 @@
+// RUN: fir-opt %s --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data{enable-implicit-reduction-copy=true})" -split-input-file | FileCheck %s --check-prefix=COPY
+// RUN: fir-opt %s --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data{enable-implicit-reduction-copy=false})" -split-input-file | FileCheck %s --check-prefix=FIRSTPRIVATE
+
+// Test case: integer reduction in a parallel loop
+// This corresponds to Fortran code:
+// integer :: r, i
+// r = 0
+// !$acc parallel
+// !$acc loop gang reduction(+:r)
+// do i = 1, N
+// r = r + 1
+// enddo
+// !$acc end parallel
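+//
+// The hand-written recipe below mirrors the structure flang generates for
+// such a reduction: the init region allocates a private copy seeded with the
+// identity of the operator (0 for add), and the combiner region folds one
+// partial result into the other.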
+
+acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init {
+^bb0(%arg0: !fir.ref<i32>):
+ %c0_i32 = arith.constant 0 : i32
+ %0 = fir.alloca i32
+ %1 = fir.declare %0 {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ fir.store %c0_i32 to %1 : !fir.ref<i32>
+ acc.yield %1 : !fir.ref<i32>
+} combiner {
+^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>):
+ %0 = fir.load %arg0 : !fir.ref<i32>
+ %1 = fir.load %arg1 : !fir.ref<i32>
+ %2 = arith.addi %0, %1 : i32
+ fir.store %2 to %arg0 : !fir.ref<i32>
+ acc.yield %arg0 : !fir.ref<i32>
+}
+
+func.func @test_reduction_implicit_copy() {
+ %c1_i32 = arith.constant 1 : i32
+ %cN = arith.constant 100 : i32
+ %r = fir.alloca i32 {bindc_name = "r", uniq_name = "_QFEr"}
+ %i = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+ %r_decl = fir.declare %r {uniq_name = "_QFEr"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %i_decl = fir.declare %i {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %c0_i32 = arith.constant 0 : i32
+ fir.store %c0_i32 to %r_decl : !fir.ref<i32>
+
+ acc.parallel {
+ %red_var = acc.reduction varPtr(%r_decl : !fir.ref<i32>) -> !fir.ref<i32> {name = "r"}
+ acc.loop reduction(@reduction_add_ref_i32 -> %red_var : !fir.ref<i32>) control(%iv : i32) = (%c1_i32 : i32) to (%cN : i32) step (%c1_i32 : i32) {
+ fir.store %iv to %i_decl : !fir.ref<i32>
+ %cur_r = fir.load %red_var : !fir.ref<i32>
+ %new_r = arith.addi %cur_r, %c1_i32 : i32
+ fir.store %new_r to %red_var : !fir.ref<i32>
+ acc.yield
+ } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+ acc.yield
+ }
+ return
+}
+
+// When enable-implicit-reduction-copy=true: expect copyin/copyout for reduction variable
+// COPY: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {dataClause = #acc<data_clause acc_reduction>, implicit = true, name = "r"}
+// COPY: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<i32>) to varPtr({{.*}} : !fir.ref<i32>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "r"}
+
+// When enable-implicit-reduction-copy=false: expect firstprivate for reduction variable
+// FIRSTPRIVATE: acc.firstprivate varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "r"}
+// FIRSTPRIVATE-NOT: acc.copyin
+// FIRSTPRIVATE-NOT: acc.copyout
+
+// -----
+
+// Test case: reduction variable used both inside and outside the loop (should be firstprivate)
+// This corresponds to Fortran code:
+// integer :: r = 0, i, out
+// !$acc parallel num_gangs(1)
+// !$acc loop reduction(+:r) copyout(out)
+// do i = 1, N
+// r = r + 1
+// enddo
+// out = r
+// !$acc end parallel
+
+acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init {
+^bb0(%arg0: !fir.ref<i32>):
+ %c0_i32 = arith.constant 0 : i32
+ %0 = fir.alloca i32
+ %1 = fir.declare %0 {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ fir.store %c0_i32 to %1 : !fir.ref<i32>
+ acc.yield %1 : !fir.ref<i32>
+} combiner {
+^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>):
+ %0 = fir.load %arg0 : !fir.ref<i32>
+ %1 = fir.load %arg1 : !fir.ref<i32>
+ %2 = arith.addi %0, %1 : i32
+ fir.store %2 to %arg0 : !fir.ref<i32>
+ acc.yield %arg0 : !fir.ref<i32>
+}
+
+func.func @test_reduction_with_usage_outside_loop() {
+ %c1_i32 = arith.constant 1 : i32
+ %cN = arith.constant 100 : i32
+ %c0_i32 = arith.constant 0 : i32
+
+ %r = fir.alloca i32 {bindc_name = "r", uniq_name = "_QFEr"}
+ %i = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+ %out = fir.alloca i32 {bindc_name = "out", uniq_name = "_QFEout"}
+
+ %r_decl = fir.declare %r {uniq_name = "_QFEr"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %i_decl = fir.declare %i {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ %out_decl = fir.declare %out {uniq_name = "_QFEout"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ fir.store %c0_i32 to %r_decl : !fir.ref<i32>
+
+ %out_copyout = acc.create varPtr(%out_decl : !fir.ref<i32>) -> !fir.ref<i32> {dataClause = #acc<data_clause acc_copyout>, name = "out"}
+ acc.parallel dataOperands(%out_copyout : !fir.ref<i32>) {
+ %red_var = acc.reduction varPtr(%r_decl : !fir.ref<i32>) -> !fir.ref<i32> {name = "r"}
+ acc.loop reduction(@reduction_add_ref_i32 -> %red_var : !fir.ref<i32>) control(%iv : i32) = (%c1_i32 : i32) to (%cN : i32) step (%c1_i32 : i32) {
+ fir.store %iv to %i_decl : !fir.ref<i32>
+ %cur_r = fir.load %red_var : !fir.ref<i32>
+ %new_r = arith.addi %cur_r, %c1_i32 : i32
+ fir.store %new_r to %red_var : !fir.ref<i32>
+ acc.yield
+ } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+ // out = r (usage of r outside the loop)
+ %final_r = fir.load %r_decl : !fir.ref<i32>
+ fir.store %final_r to %out_copyout : !fir.ref<i32>
+ acc.yield
+ }
+ acc.copyout accPtr(%out_copyout : !fir.ref<i32>) to varPtr(%out_decl : !fir.ref<i32>) {dataClause = #acc<data_clause acc_copyout>, name = "out"}
+ return
+}
+
+// In this case, r should be firstprivate regardless of the flag setting because it is also used outside the reduction loop.
+// COPY-LABEL: func.func @test_reduction_with_usage_outside_loop
+// COPY: acc.firstprivate varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "r"}
+// COPY-NOT: acc.copyin varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {{.*}} name = "r"
+
+// FIRSTPRIVATE-LABEL: func.func @test_reduction_with_usage_outside_loop
+// FIRSTPRIVATE: acc.firstprivate varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "r"}
+// FIRSTPRIVATE-NOT: acc.copyin varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {{.*}} name = "r"
+
diff --git a/flang/test/Transforms/OpenACC/acc-implicit-data-derived-type-member.F90 b/flang/test/Transforms/OpenACC/acc-implicit-data-derived-type-member.F90
new file mode 100644
index 0000000..71e7d79
--- /dev/null
+++ b/flang/test/Transforms/OpenACC/acc-implicit-data-derived-type-member.F90
@@ -0,0 +1,38 @@
+!RUN: rm -rf %t && mkdir %t && cd %t && \
+!RUN: bbc %s -fopenacc -emit-hlfir -o - \
+!RUN: | fir-opt --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data)" \
+!RUN: | FileCheck %s
+
+! This test exercises whether the ACCImplicitData pass inserts its new data
+! operations in the appropriate position so that parents are copied in
+! before their children.
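+!
+! Concretely, the pass's implicit copyin of the parent d2 has to be
+! materialized before the explicit copyin of d2%member0 (and likewise for
+! d4), which is the ordering the CHECK lines at the bottom verify.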
+
+module types
+ type derivc8r4
+ complex(8) :: member0
+ real(4) :: member1
+ end type derivc8r4
+end module
+program test
+ use types
+ implicit none
+ type (derivc8r4) :: d2
+ type (derivc8r4) :: d4
+ integer(4) :: i0
+ d2%member0 = 123
+ !$acc serial copyin(d2%member0) copyout(d4%member0)
+ do i0 = 1, 1
+ d4%member0 = d2%member0
+ end do
+ !$acc end serial
+end program
+
+!CHECK: acc.copyin {{.*}} {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "d2"}
+!CHECK: acc.copyin {{.*}} {name = "d2%member0"}
+!CHECK: acc.copyin {{.*}} {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "d4"}
+!CHECK: acc.create {{.*}} {dataClause = #acc<data_clause acc_copyout>, name = "d4%member0"}
+!CHECK: acc.delete {{.*}} {dataClause = #acc<data_clause acc_copyin>, name = "d2%member0"}
+!CHECK: acc.copyout {{.*}} {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "d2"}
+!CHECK: acc.copyout {{.*}} {name = "d4%member0"}
+!CHECK: acc.copyout {{.*}} {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "d4"}
+
diff --git a/flang/test/Transforms/OpenACC/acc-implicit-data-fortran.F90 b/flang/test/Transforms/OpenACC/acc-implicit-data-fortran.F90
new file mode 100644
index 0000000..228aba1
--- /dev/null
+++ b/flang/test/Transforms/OpenACC/acc-implicit-data-fortran.F90
@@ -0,0 +1,79 @@
+!RUN: rm -rf %t && mkdir %t && cd %t && \
+!RUN: bbc %s -fopenacc -emit-hlfir -o - \
+!RUN: | fir-opt --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data)" \
+!RUN: | FileCheck %s --check-prefix=CHECKHLFIR
+
+!RUN: rm -rf %t && mkdir %t && cd %t && \
+!RUN: bbc %s -fopenacc -emit-hlfir -o - \
+!RUN: | fir-opt --pass-pipeline="builtin.module(cse,acc-initialize-fir-analyses,acc-implicit-data)" \
+!RUN: | FileCheck %s --check-prefix=CHECKCSE
+
+!RUN: rm -rf %t && mkdir %t && cd %t && \
+!RUN: bbc %s -fopenacc -emit-fir -o - \
+!RUN: | fir-opt --pass-pipeline="builtin.module(cse,acc-initialize-fir-analyses,acc-implicit-data)" \
+!RUN: | FileCheck %s --check-prefix=CHECKCSE
+
+! This test uses bbc to generate both HLFIR and FIR. The intent is to
+! exercise the acc implicit data pipeline and to ensure that the correct
+! clauses are generated. It also runs CSE, which eliminates redundant
+! interior pointer computations (and thus different live-ins are found).
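+!
+! For example, without CSE each use of aggrvar%field recomputes the interior
+! address inside the compute region, so the whole derived-type variable is
+! the live-in (CHECKHLFIR expects a copyin of "aggrvar"); with CSE the
+! address is computed once before the region and the f32 member itself is
+! the live-in (CHECKCSE expects a copyin of "aggrvar%field").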
+
+program main
+ type aggr
+ real :: field
+ end type
+ type nested
+ type(aggr) :: outer
+ end type
+ type(aggr) :: aggrvar
+ type(nested) :: nestaggrvar
+ real :: scalarvar
+ real :: arrayvar(10)
+ complex :: scalarcomp
+
+ aggrvar%field = 1
+ scalarvar = aggrvar%field
+ nestaggrvar%outer%field = scalarvar
+ scalarcomp = scalarvar
+ arrayvar = real(scalarcomp)
+ arrayvar(2) = aggrvar%field
+
+ !$acc kernels
+ arrayvar = aggrvar%field + scalarvar + nestaggrvar%outer%field + real(scalarcomp) + arrayvar(2)
+ !$acc end kernels
+
+ !$acc parallel
+ arrayvar = aggrvar%field + scalarvar + nestaggrvar%outer%field + real(scalarcomp) + arrayvar(2)
+ !$acc end parallel
+end program
+
+!CHECKHLFIR-LABEL: @_QQmain
+!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"}
+!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"}
+!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.type<_QFTnested{outer:!fir.type<_QFTaggr{field:f32}>}>>) -> !fir.ref<!fir.type<_QFTnested{outer:!fir.type<_QFTaggr{field:f32}>}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "nestaggrvar"}
+!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarcomp"}
+!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"}
+!CHECKHLFIR: acc.kernels
+!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"}
+!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"}
+!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.type<_QFTnested{outer:!fir.type<_QFTaggr{field:f32}>}>>) -> !fir.ref<!fir.type<_QFTnested{outer:!fir.type<_QFTaggr{field:f32}>}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "nestaggrvar"}
+!CHECKHLFIR-DAG: acc.firstprivate varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {implicit = true, name = "scalarcomp"}
+!CHECKHLFIR-DAG: acc.firstprivate varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "scalarvar"}
+!CHECKHLFIR: acc.parallel
+
+!CHECKCSE-LABEL: @_QQmain
+!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"}
+!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarcomp"}
+!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"}
+!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar%field"}
+!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "nestaggrvar%outer%field"}
+!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar(2)"}
+!CHECKCSE: acc.kernels
+!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"}
+!CHECKCSE-DAG: acc.firstprivate varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {implicit = true, name = "scalarcomp"}
+!CHECKCSE-DAG: acc.firstprivate varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "scalarvar"}
+!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar%field"}
+!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "nestaggrvar%outer%field"}
+!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar(2)"}
+!CHECKCSE: acc.parallel
+
diff --git a/flang/test/Transforms/OpenACC/acc-implicit-data.fir b/flang/test/Transforms/OpenACC/acc-implicit-data.fir
new file mode 100644
index 0000000..7f6a57c
--- /dev/null
+++ b/flang/test/Transforms/OpenACC/acc-implicit-data.fir
@@ -0,0 +1,358 @@
+// RUN: fir-opt %s --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data)" -split-input-file | FileCheck %s
+
+// -----
+
+func.func @test_fir_scalar_in_serial() {
+ %livein = fir.alloca i64 {bindc_name = "scalarvar"}
+ acc.serial {
+ %load = fir.load %livein : !fir.ref<i64>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: acc.firstprivate varPtr({{.*}} : !fir.ref<i64>) -> !fir.ref<i64> {implicit = true, name = "scalarvar"}
+
+// -----
+
+func.func @test_fir_scalar_in_parallel() {
+ %livein = fir.alloca f32 {bindc_name = "scalarvar"}
+ acc.parallel {
+ %load = fir.load %livein : !fir.ref<f32>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: acc.firstprivate varPtr({{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "scalarvar"}
+
+// -----
+
+func.func @test_fir_scalar_in_kernels() {
+ %livein = fir.alloca f64 {bindc_name = "scalarvar"}
+ acc.kernels {
+ %load = fir.load %livein : !fir.ref<f64>
+ acc.terminator
+ }
+ return
+}
+
+// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<f64>) -> !fir.ref<f64> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"}
+// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<f64>) to varPtr({{.*}} : !fir.ref<f64>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"}
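+//
+// Note: the three tests above reflect the default treatment of scalar
+// live-ins: firstprivate on acc.serial and acc.parallel, but an implicit
+// copyin/copyout pair on acc.kernels.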
+
+// -----
+
+func.func @test_fir_scalar_in_parallel_defaultnone() {
+ %livein = fir.alloca f32 {bindc_name = "scalarvar"}
+ acc.parallel {
+ %load = fir.load %livein : !fir.ref<f32>
+ acc.yield
+ } attributes {defaultAttr = #acc<defaultvalue none>}
+ return
+}
+
+// CHECK-NOT: acc.firstprivate
+
+// -----
+
+func.func @test_fir_scalar_in_kernels_defaultnone() {
+ %livein = fir.alloca f64 {bindc_name = "scalarvar"}
+ acc.kernels {
+ %load = fir.load %livein : !fir.ref<f64>
+ acc.terminator
+ } attributes {defaultAttr = #acc<defaultvalue none>}
+ return
+}
+
+// CHECK-NOT: acc.copyin
+
+// -----
+
+func.func @test_fir_derivedtype_in_parallel() {
+ %livein = fir.alloca !fir.type<_QFTaggr{field:f32}> {bindc_name = "aggrvar"}
+ acc.parallel {
+ %load = fir.load %livein : !fir.ref<!fir.type<_QFTaggr{field:f32}>>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"}
+// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) to varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"}
+
+// -----
+
+func.func @test_fir_derivedtype_in_kernels() {
+ %livein = fir.alloca !fir.type<_QFTaggr{field:f32}> {bindc_name = "aggrvar"}
+ acc.kernels {
+ %load = fir.load %livein : !fir.ref<!fir.type<_QFTaggr{field:f32}>>
+ acc.terminator
+ }
+ return
+}
+
+// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"}
+// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) to varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"}
+
+// -----
+
+func.func @test_fir_array_in_parallel() {
+ %livein = fir.alloca !fir.array<10xf32> {bindc_name = "arrayvar"}
+ acc.parallel {
+ %load = fir.load %livein : !fir.ref<!fir.array<10xf32>>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"}
+// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.array<10xf32>>) to varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"}
+
+// -----
+
+func.func @test_fir_array_in_kernels() {
+ %livein = fir.alloca !fir.array<10xf32> {bindc_name = "arrayvar"}
+ acc.kernels {
+ %load = fir.load %livein : !fir.ref<!fir.array<10xf32>>
+ acc.terminator
+ }
+ return
+}
+
+// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"}
+// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.array<10xf32>>) to varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"}
+
+// -----
+
+func.func @test_fir_derivedtype_in_parallel_defaultpresent() {
+ %livein = fir.alloca !fir.type<_QFTaggr{field:f32}> {bindc_name = "aggrvar"}
+ acc.parallel {
+ %load = fir.load %livein : !fir.ref<!fir.type<_QFTaggr{field:f32}>>
+ acc.yield
+ } attributes {defaultAttr = #acc<defaultvalue present>}
+ return
+}
+
+// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {implicit = true, name = "aggrvar"}
+// CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) {dataClause = #acc<data_clause acc_present>, implicit = true, name = "aggrvar"}
+
+// -----
+
+func.func @test_fir_derivedtype_in_kernels_defaultpresent() {
+ %livein = fir.alloca !fir.type<_QFTaggr{field:f32}> {bindc_name = "aggrvar"}
+ acc.kernels {
+ %load = fir.load %livein : !fir.ref<!fir.type<_QFTaggr{field:f32}>>
+ acc.terminator
+ } attributes {defaultAttr = #acc<defaultvalue present>}
+ return
+}
+
+// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {implicit = true, name = "aggrvar"}
+// CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) {dataClause = #acc<data_clause acc_present>, implicit = true, name = "aggrvar"}
+
+// -----
+
+func.func @test_fir_array_in_parallel_defaultpresent() {
+ %livein = fir.alloca !fir.array<10xf32> {bindc_name = "arrayvar"}
+ acc.parallel {
+ %load = fir.load %livein : !fir.ref<!fir.array<10xf32>>
+ acc.yield
+ } attributes {defaultAttr = #acc<defaultvalue present>}
+ return
+}
+
+// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {implicit = true, name = "arrayvar"}
+// CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_present>, implicit = true, name = "arrayvar"}
+
+// -----
+
+func.func @test_fir_array_in_kernels_defaultpresent() {
+ %livein = fir.alloca !fir.array<10xf32> {bindc_name = "arrayvar"}
+ acc.kernels {
+ %load = fir.load %livein : !fir.ref<!fir.array<10xf32>>
+ acc.terminator
+ } attributes {defaultAttr = #acc<defaultvalue present>}
+ return
+}
+
+// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {implicit = true, name = "arrayvar"}
+// CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_present>, implicit = true, name = "arrayvar"}
+
+// -----
+
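+// Scalars keep their usual implicit treatment even under default(present):
+// firstprivate in parallel regions, and implicit copy in kernels regions
+// (see the next test).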
+func.func @test_fir_scalar_in_parallel_defaultpresent() {
+ %livein = fir.alloca f32 {bindc_name = "scalarvar"}
+ acc.parallel {
+ %load = fir.load %livein : !fir.ref<f32>
+ acc.yield
+ } attributes {defaultAttr = #acc<defaultvalue present>}
+ return
+}
+
+// CHECK: acc.firstprivate varPtr({{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "scalarvar"}
+
+// -----
+
+func.func @test_fir_scalar_in_kernels_defaultpresent() {
+ %livein = fir.alloca f64 {bindc_name = "scalarvar"}
+ acc.kernels {
+ %load = fir.load %livein : !fir.ref<f64>
+ acc.terminator
+ } attributes {defaultAttr = #acc<defaultvalue present>}
+ return
+}
+
+// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<f64>) -> !fir.ref<f64> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"}
+// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<f64>) to varPtr({{.*}} : !fir.ref<f64>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"}
+
+// -----
+
+func.func @test_fir_box_ref() {
+ %livein = fir.alloca !fir.box<!fir.array<?xi32>> {bindc_name = "descriptor"}
+ acc.parallel {
+ %load = fir.load %livein : !fir.ref<!fir.box<!fir.array<?xi32>>>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.array<?xi32>>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "descriptor"}
+// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.box<!fir.array<?xi32>>>) to varPtr({{.*}} : !fir.ref<!fir.box<!fir.array<?xi32>>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "descriptor"}
+
+// -----
+
+func.func @test_fir_box_val() {
+ %desc = fir.alloca !fir.box<!fir.array<?xi32>> {bindc_name = "descriptor"}
+ %livein = fir.load %desc : !fir.ref<!fir.box<!fir.array<?xi32>>>
+ acc.parallel {
+ %addr = fir.box_addr %livein : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[COPYIN:.*]] = acc.copyin var({{.*}} : !fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "descriptor"}
+// CHECK: acc.copyout accVar(%[[COPYIN]] : !fir.box<!fir.array<?xi32>>) to var({{.*}} : !fir.box<!fir.array<?xi32>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "descriptor"}
+
+// -----
+
+// This test has an explicit data clause for the box, but the region uses the
+// pointer held inside the box instead of the box itself. Test that an
+// implicit present is generated for the pointer.
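+// A Fortran fragment that could produce this pattern (assumed here purely for
+// illustration):
+//   real :: aa(10)
+//   !$acc serial copy(aa)
+//   aa(1) = 1.0
+//   !$acc end serial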
+func.func @test_explicit_box_implicit_ptr() {
+ %c1 = arith.constant 1 : index
+ %c10 = arith.constant 10 : index
+ %arr = fir.alloca !fir.array<10xf32> {bindc_name = "aa"}
+ %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+ %arr_decl = fir.declare %arr(%shape) {uniq_name = "aa"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<10xf32>>
+ %box = fir.embox %arr_decl(%shape) : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<10xf32>>
+ %copyin = acc.copyin var(%box : !fir.box<!fir.array<10xf32>>) -> !fir.box<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, name = "aa"}
+ acc.serial dataOperands(%copyin : !fir.box<!fir.array<10xf32>>) {
+ // Use the pointer, not the box
+ %elem = fir.array_coor %arr_decl(%shape) %c1 : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+ acc.yield
+ }
+ acc.copyout accVar(%copyin : !fir.box<!fir.array<10xf32>>) to var(%box : !fir.box<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "aa"}
+ return
+}
+
+// CHECK: acc.present varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>){{.*}}-> !fir.ref<!fir.array<10xf32>> {implicit = true, name = "aa"}
+
+// -----
+
+// This test uses an explicit-shape array with no data clause. It also reflects
+// an optimization where the pointer is used instead of the boxed entity.
+// It tests that the implicit data pass can recover the size even though it is
+// not encoded in the FIR type.
+// It was generated from the following Fortran source:
+// subroutine array(aa,nn)
+// integer :: nn
+// real :: aa(10:nn)
+// !$acc kernels loop
+// do ii = 10, nn
+// aa(ii) = ii
+// end do
+// !$acc end kernels
+// end subroutine
+
+func.func @_QParray(%arg0: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "aa"}, %arg1: !fir.ref<i32> {fir.bindc_name = "nn"}) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c10_i64 = arith.constant 10 : i64
+ %0 = fir.dummy_scope : !fir.dscope
+ %1 = fir.declare %arg1 dummy_scope %0 {uniq_name = "_QFarrayEnn"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+ %4 = fir.convert %c10_i64 : (i64) -> index
+ %5 = fir.load %1 : !fir.ref<i32>
+ %6 = fir.convert %5 : (i32) -> i64
+ %7 = fir.convert %6 : (i64) -> index
+ %8 = arith.subi %7, %4 : index
+ %9 = arith.addi %8, %c1 : index
+ %10 = arith.cmpi sgt, %9, %c0 : index
+ %11 = arith.select %10, %9, %c0 : index
+ %12 = fir.shape_shift %4, %11 : (index, index) -> !fir.shapeshift<1>
+ %13 = fir.declare %arg0(%12) dummy_scope %0 {uniq_name = "_QFarrayEaa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, !fir.dscope) -> !fir.ref<!fir.array<?xf32>>
+ acc.kernels {
+ %elem = fir.array_coor %13(%12) %4 : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, index) -> !fir.ref<f32>
+ acc.terminator
+ }
+ return
+}
+
+// This confirms that the acc.bounds operation is as expected.
+// Effectively the extent needs to be max(0, nn - 10 + 1), the stride needs to
+// be 1, the adjusted lowerbound is 0, and the actual language start index is 10.
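+// For example, with nn = 20 the extent is max(0, 20 - 10 + 1) = 11 and the
+// transferred section is aa(10:20).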
+// CHECK: %[[NN:.*]] = fir.declare %{{.*}} dummy_scope %{{.*}} {uniq_name = "_QFarrayEnn"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+// CHECK: %[[C10:.*]] = fir.convert %c10{{.*}} : (i64) -> index
+// CHECK: %[[LOADEDNN:.*]] = fir.load %[[NN]] : !fir.ref<i32>
+// CHECK: %[[CAST1:.*]] = fir.convert %[[LOADEDNN]] : (i32) -> i64
+// CHECK: %[[CAST2:.*]] = fir.convert %[[CAST1]] : (i64) -> index
+// CHECK: %[[SUBI:.*]] = arith.subi %[[CAST2]], %[[C10]] : index
+// CHECK: %[[ADDI:.*]] = arith.addi %[[SUBI]], %c1{{.*}} : index
+// CHECK: %[[CMPI:.*]] = arith.cmpi sgt, %[[ADDI]], %c0{{.*}} : index
+// CHECK: %[[SELECT:.*]] = arith.select %[[CMPI]], %[[ADDI]], %c0{{.*}} : index
+// CHECK: %[[BOUNDS:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[SELECT]] : index) stride(%c1{{.*}} : index) startIdx(%[[C10]] : index)
+// CHECK: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<?xf32>>) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aa"}
+
+// -----
+
+// Test to confirm that a copyin clause is not implicitly generated for a deviceptr symbol.
+func.func @test_deviceptr_no_implicit_copy() {
+ %c10 = arith.constant 10 : index
+ %arr = fir.alloca !fir.array<10xf64> {bindc_name = "a"}
+ %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+ %arr_box = fir.embox %arr(%shape) : (!fir.ref<!fir.array<10xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<10xf64>>
+ %devptr = acc.deviceptr var(%arr_box : !fir.box<!fir.array<10xf64>>) -> !fir.box<!fir.array<10xf64>> {name = "a"}
+ acc.parallel dataOperands(%devptr : !fir.box<!fir.array<10xf64>>) {
+ %elem = fir.box_addr %arr_box : (!fir.box<!fir.array<10xf64>>) -> !fir.ref<!fir.array<10xf64>>
+ acc.yield
+ }
+ return
+}
+
+// CHECK-NOT: acc.copyin
+// CHECK: acc.deviceptr
+
+// -----
+
+// Test that acc.declare with deviceptr does not generate an implicit copyin.
+func.func @test_acc_declare_deviceptr() {
+ %c10 = arith.constant 10 : index
+ %arr = fir.alloca !fir.array<10xf64> {bindc_name = "a"}
+ %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+ %arr_box = fir.embox %arr(%shape) : (!fir.ref<!fir.array<10xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<10xf64>>
+ %devptr = acc.deviceptr var(%arr_box : !fir.box<!fir.array<10xf64>>) -> !fir.box<!fir.array<10xf64>> {name = "a"}
+ %token = acc.declare_enter dataOperands(%devptr : !fir.box<!fir.array<10xf64>>)
+ acc.parallel {
+ %elem = fir.box_addr %arr_box : (!fir.box<!fir.array<10xf64>>) -> !fir.ref<!fir.array<10xf64>>
+ acc.yield
+ }
+ acc.declare_exit token(%token)
+ return
+}
+
+// CHECK-LABEL: func.func @test_acc_declare_deviceptr
+// CHECK: acc.deviceptr
+// CHECK-NOT: acc.copyin
+
diff --git a/flang/test/Transforms/OpenACC/acc-implicit-firstprivate.fir b/flang/test/Transforms/OpenACC/acc-implicit-firstprivate.fir
new file mode 100644
index 0000000..e4a7b8b
--- /dev/null
+++ b/flang/test/Transforms/OpenACC/acc-implicit-firstprivate.fir
@@ -0,0 +1,284 @@
+// RUN: fir-opt %s --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data)" -split-input-file | FileCheck %s
+
+// Test implicit firstprivate behavior for various scalar types in parallel and serial constructs.
+// Scalars in parallel/serial constructs should be implicitly firstprivate according to the OpenACC spec.
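+//
+// For reference, a Fortran fragment like the following would exercise this
+// behavior (assumed here for illustration only):
+//   subroutine f(x)
+//     integer :: x
+//     !$acc parallel
+//     x = x + 1
+//     !$acc end parallel
+//   end subroutine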
+
+// -----
+
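+// Each scalar type gets a firstprivatization recipe: an init region that
+// allocates the private copy and a copy region that loads the original value
+// and stores it into the copy.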
+// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_i32 : !fir.ref<i32> init {
+// CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
+// CHECK: %[[ALLOC:.*]] = fir.alloca i32
+// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]]
+// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<i32>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i32>, %[[DST:.*]]: !fir.ref<i32>):
+// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<i32>
+// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<i32>
+// CHECK: acc.terminator
+// CHECK: }
+
+// CHECK-LABEL: func.func @test_i32_scalar_in_parallel
+func.func @test_i32_scalar_in_parallel() {
+ %scalar = fir.alloca i32 {bindc_name = "i32_var"}
+ acc.parallel {
+ %load = fir.load %scalar : !fir.ref<i32>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i32_var"}
+// CHECK: acc.parallel firstprivate(@firstprivatization_ref_i32 -> %[[FIRSTPRIV]] : !fir.ref<i32>)
+
+// -----
+
+// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_i64 : !fir.ref<i64> init {
+// CHECK: ^bb0(%{{.*}}: !fir.ref<i64>):
+// CHECK: %[[ALLOC:.*]] = fir.alloca i64
+// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]]
+// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<i64>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i64>, %[[DST:.*]]: !fir.ref<i64>):
+// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<i64>
+// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<i64>
+// CHECK: acc.terminator
+// CHECK: }
+
+// CHECK-LABEL: func.func @test_i64_scalar_in_parallel
+func.func @test_i64_scalar_in_parallel() {
+ %scalar = fir.alloca i64 {bindc_name = "i64_var"}
+ acc.parallel {
+ %load = fir.load %scalar : !fir.ref<i64>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i64>) -> !fir.ref<i64> {implicit = true, name = "i64_var"}
+// CHECK: acc.parallel firstprivate(@firstprivatization_ref_i64 -> %[[FIRSTPRIV]] : !fir.ref<i64>)
+
+// -----
+
+// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_f32 : !fir.ref<f32> init {
+// CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
+// CHECK: %[[ALLOC:.*]] = fir.alloca f32
+// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]]
+// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<f32>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<f32>, %[[DST:.*]]: !fir.ref<f32>):
+// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<f32>
+// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<f32>
+// CHECK: acc.terminator
+// CHECK: }
+
+// CHECK-LABEL: func.func @test_f32_scalar_in_parallel
+func.func @test_f32_scalar_in_parallel() {
+ %scalar = fir.alloca f32 {bindc_name = "f32_var"}
+ acc.parallel {
+ %load = fir.load %scalar : !fir.ref<f32>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "f32_var"}
+// CHECK: acc.parallel firstprivate(@firstprivatization_ref_f32 -> %[[FIRSTPRIV]] : !fir.ref<f32>)
+
+// -----
+
+// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_f64 : !fir.ref<f64> init {
+// CHECK: ^bb0(%{{.*}}: !fir.ref<f64>):
+// CHECK: %[[ALLOC:.*]] = fir.alloca f64
+// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]]
+// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<f64>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<f64>, %[[DST:.*]]: !fir.ref<f64>):
+// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<f64>
+// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<f64>
+// CHECK: acc.terminator
+// CHECK: }
+
+// CHECK-LABEL: func.func @test_f64_scalar_in_parallel
+func.func @test_f64_scalar_in_parallel() {
+ %scalar = fir.alloca f64 {bindc_name = "f64_var"}
+ acc.parallel {
+ %load = fir.load %scalar : !fir.ref<f64>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<f64>) -> !fir.ref<f64> {implicit = true, name = "f64_var"}
+// CHECK: acc.parallel firstprivate(@firstprivatization_ref_f64 -> %[[FIRSTPRIV]] : !fir.ref<f64>)
+
+// -----
+
+// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_l32 : !fir.ref<!fir.logical<4>> init {
+// CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
+// CHECK: %[[ALLOC:.*]] = fir.alloca !fir.logical<4>
+// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]]
+// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<!fir.logical<4>>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<!fir.logical<4>>, %[[DST:.*]]: !fir.ref<!fir.logical<4>>):
+// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<!fir.logical<4>>
+// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<!fir.logical<4>>
+// CHECK: acc.terminator
+// CHECK: }
+
+// CHECK-LABEL: func.func @test_logical_scalar_in_parallel
+func.func @test_logical_scalar_in_parallel() {
+ %scalar = fir.alloca !fir.logical<4> {bindc_name = "logical_var"}
+ acc.parallel {
+ %load = fir.load %scalar : !fir.ref<!fir.logical<4>>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>> {implicit = true, name = "logical_var"}
+// CHECK: acc.parallel firstprivate(@firstprivatization_ref_l32 -> %[[FIRSTPRIV]] : !fir.ref<!fir.logical<4>>)
+
+// -----
+
+// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_z32 : !fir.ref<complex<f32>> init {
+// CHECK: ^bb0(%{{.*}}: !fir.ref<complex<f32>>):
+// CHECK: %[[ALLOC:.*]] = fir.alloca complex<f32>
+// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]]
+// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<complex<f32>>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<complex<f32>>, %[[DST:.*]]: !fir.ref<complex<f32>>):
+// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<complex<f32>>
+// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<complex<f32>>
+// CHECK: acc.terminator
+// CHECK: }
+
+// CHECK-LABEL: func.func @test_complex_scalar_in_parallel
+func.func @test_complex_scalar_in_parallel() {
+ %scalar = fir.alloca complex<f32> {bindc_name = "complex_var"}
+ acc.parallel {
+ %load = fir.load %scalar : !fir.ref<complex<f32>>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {implicit = true, name = "complex_var"}
+// CHECK: acc.parallel firstprivate(@firstprivatization_ref_z32 -> %[[FIRSTPRIV]] : !fir.ref<complex<f32>>)
+
+// -----
+
+// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_z64 : !fir.ref<complex<f64>> init {
+// CHECK: ^bb0(%{{.*}}: !fir.ref<complex<f64>>):
+// CHECK: %[[ALLOC:.*]] = fir.alloca complex<f64>
+// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]]
+// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<complex<f64>>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<complex<f64>>, %[[DST:.*]]: !fir.ref<complex<f64>>):
+// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<complex<f64>>
+// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<complex<f64>>
+// CHECK: acc.terminator
+// CHECK: }
+
+// CHECK-LABEL: func.func @test_complex8_scalar_in_parallel
+func.func @test_complex8_scalar_in_parallel() {
+ %scalar = fir.alloca complex<f64> {bindc_name = "complex8_var"}
+ acc.parallel {
+ %load = fir.load %scalar : !fir.ref<complex<f64>>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<complex<f64>>) -> !fir.ref<complex<f64>> {implicit = true, name = "complex8_var"}
+// CHECK: acc.parallel firstprivate(@firstprivatization_ref_z64 -> %[[FIRSTPRIV]] : !fir.ref<complex<f64>>)
+
+// -----
+
+// Test with serial construct
+
+// CHECK-LABEL: func.func @test_i32_scalar_in_serial
+func.func @test_i32_scalar_in_serial() {
+ %scalar = fir.alloca i32 {bindc_name = "serial_i32_var"}
+ acc.serial {
+ %load = fir.load %scalar : !fir.ref<i32>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "serial_i32_var"}
+// CHECK: acc.serial firstprivate(@firstprivatization_ref_i32 -> %[[FIRSTPRIV]] : !fir.ref<i32>)
+
+// -----
+
+// Test with serial construct and f64
+
+// CHECK-LABEL: func.func @test_f64_scalar_in_serial
+func.func @test_f64_scalar_in_serial() {
+ %scalar = fir.alloca f64 {bindc_name = "serial_f64_var"}
+ acc.serial {
+ %load = fir.load %scalar : !fir.ref<f64>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<f64>) -> !fir.ref<f64> {implicit = true, name = "serial_f64_var"}
+// CHECK: acc.serial firstprivate(@firstprivatization_ref_f64 -> %[[FIRSTPRIV]] : !fir.ref<f64>)
+
+// -----
+
+// Test i8 and i16 scalar types
+
+// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_i8 : !fir.ref<i8> init {
+// CHECK: ^bb0(%{{.*}}: !fir.ref<i8>):
+// CHECK: %[[ALLOC:.*]] = fir.alloca i8
+// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]]
+// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<i8>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i8>, %[[DST:.*]]: !fir.ref<i8>):
+// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<i8>
+// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<i8>
+// CHECK: acc.terminator
+// CHECK: }
+
+// CHECK-LABEL: func.func @test_i8_scalar_in_parallel
+func.func @test_i8_scalar_in_parallel() {
+ %scalar = fir.alloca i8 {bindc_name = "i8_var"}
+ acc.parallel {
+ %load = fir.load %scalar : !fir.ref<i8>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i8>) -> !fir.ref<i8> {implicit = true, name = "i8_var"}
+// CHECK: acc.parallel firstprivate(@firstprivatization_ref_i8 -> %[[FIRSTPRIV]] : !fir.ref<i8>)
+
+// -----
+
+// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_i16 : !fir.ref<i16> init {
+// CHECK: ^bb0(%{{.*}}: !fir.ref<i16>):
+// CHECK: %[[ALLOC:.*]] = fir.alloca i16
+// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]]
+// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<i16>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i16>, %[[DST:.*]]: !fir.ref<i16>):
+// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<i16>
+// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<i16>
+// CHECK: acc.terminator
+// CHECK: }
+
+// CHECK-LABEL: func.func @test_i16_scalar_in_parallel
+func.func @test_i16_scalar_in_parallel() {
+ %scalar = fir.alloca i16 {bindc_name = "i16_var"}
+ acc.parallel {
+ %load = fir.load %scalar : !fir.ref<i16>
+ acc.yield
+ }
+ return
+}
+
+// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i16>) -> !fir.ref<i16> {implicit = true, name = "i16_var"}
+// CHECK: acc.parallel firstprivate(@firstprivatization_ref_i16 -> %[[FIRSTPRIV]] : !fir.ref<i16>)
+
diff --git a/flang/test/Transforms/debug-proc-ptr.fir b/flang/test/Transforms/debug-proc-ptr.fir
new file mode 100644
index 0000000..2963557
--- /dev/null
+++ b/flang/test/Transforms/debug-proc-ptr.fir
@@ -0,0 +1,41 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s
+
+module {
+ func.func @_QQmain() attributes {fir.bindc_name = "test"} {
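+ // Procedure pointer with an i32 result: procedure(func1), pointer :: fun_ptr
+ // (the interface name func1 is assumed for illustration).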
+ %0 = fir.alloca (!fir.ref<i32>) -> i32 {bindc_name = "fun_ptr", uniq_name = "_QFEfun_ptr"}
+ %1 = fircg.ext_declare %0 {uniq_name = "_QFEfun_ptr"} : (!fir.ref<(!fir.ref<i32>) -> i32>) -> !fir.ref<(!fir.ref<i32>) -> i32> loc(#loc1)
+
+ // Procedure pointer with no return value: procedure(sub1), pointer :: sub_ptr
+ %2 = fir.alloca () -> () {bindc_name = "sub_ptr", uniq_name = "_QFEsub_ptr"}
+ %3 = fircg.ext_declare %2 {uniq_name = "_QFEsub_ptr"} : (!fir.ref<() -> ()>) -> !fir.ref<() -> ()> loc(#loc2)
+
+ // Procedure pointer with multiple args: procedure(func2), pointer :: func_ptr
+ %4 = fir.alloca (!fir.ref<i32>, !fir.ref<f64>) -> f32 {bindc_name = "func_ptr", uniq_name = "_QFEfunc_ptr"}
+ %5 = fircg.ext_declare %4 {uniq_name = "_QFEfunc_ptr"} : (!fir.ref<(!fir.ref<i32>, !fir.ref<f64>) -> f32>) -> !fir.ref<(!fir.ref<i32>, !fir.ref<f64>) -> f32> loc(#loc3)
+
+ return
+ } loc(#loc)
+}
+#loc = loc("test.f90":1:1)
+#loc1 = loc("test.f90":2:30)
+#loc2 = loc("test.f90":3:30)
+#loc3 = loc("test.f90":4:30)
+
+// CHECK-DAG: #[[INT:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed>
+// CHECK-DAG: #[[REAL32:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float>
+// CHECK-DAG: #[[REAL:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real(kind=8)", sizeInBits = 64, encoding = DW_ATE_float>
+
+// CHECK-DAG: #[[PTR_INT:.*]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #[[INT]]{{.*}}>
+// CHECK-DAG: #[[PTR_REAL:.*]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #[[REAL]]{{.*}}>
+
+// CHECK-DAG: #[[SUB1:.*]] = #llvm.di_subroutine_type<types = #[[INT]], #[[PTR_INT]]>
+// CHECK-DAG: #[[PTR_SUB1:.*]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #[[SUB1]]{{.*}}>
+// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "fun_ptr"{{.*}}type = #[[PTR_SUB1]]{{.*}}>
+
+// CHECK-DAG: #di_subroutine_type{{.*}} = #llvm.di_subroutine_type<types = #di_null_type>
+// CHECK-DAG: #di_local_variable{{.*}} = #llvm.di_local_variable<{{.*}}name = "sub_ptr"{{.*}}type = #di_derived_type{{.*}}>
+// CHECK-DAG: #di_derived_type{{.*}} = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #di_subroutine_type{{.*}}{{.*}}>
+
+// CHECK-DAG: #[[SUB3:.*]] = #llvm.di_subroutine_type<types = #[[REAL32]], #[[PTR_INT]], #[[PTR_REAL]]>
+// CHECK-DAG: #[[PTR_SUB3:.*]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #[[SUB3]]{{.*}}>
+// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "func_ptr"{{.*}}type = #[[PTR_SUB3]]{{.*}}>
diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
index 7feef56..cbce62e 100644
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -127,8 +127,8 @@ find_first_character_wide_read(const unsigned char *src, unsigned char ch,
size_t cur = 0;
// Step 1: read 1 byte at a time to align to block size
- for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n;
- ++char_ptr, ++cur) {
+ for (; cur < n && reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0;
+ ++cur, ++char_ptr) {
if (*char_ptr == ch)
return const_cast<unsigned char *>(char_ptr);
}
@@ -136,18 +136,18 @@ find_first_character_wide_read(const unsigned char *src, unsigned char ch,
const Word ch_mask = repeat_byte<Word>(ch);
// Step 2: read blocks
- for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
- !has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n;
- ++block_ptr, cur += sizeof(Word)) {
- char_ptr = reinterpret_cast<const unsigned char *>(block_ptr);
- }
+ const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr);
+ for (; cur < n && !has_zeroes<Word>((*block_ptr) ^ ch_mask);
+ cur += sizeof(Word), ++block_ptr)
+ ;
+ char_ptr = reinterpret_cast<const unsigned char *>(block_ptr);
// Step 3: find the match in the block
- for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) {
+ for (; cur < n && *char_ptr != ch; ++cur, ++char_ptr) {
;
}
- if (*char_ptr != ch || cur >= n)
+ if (cur >= n || *char_ptr != ch)
return static_cast<void *>(nullptr);
return const_cast<unsigned char *>(char_ptr);
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index 0eb373c..42e8faa 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -187,6 +187,7 @@ add_header_library(
DEPENDS
libc.src.__support.CPP.type_traits
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.macros.properties.architectures
)
add_libc_test(
diff --git a/libc/test/src/stdlib/StrfromTest.h b/libc/test/src/stdlib/StrfromTest.h
index fd2e0f12..3dacfca 100644
--- a/libc/test/src/stdlib/StrfromTest.h
+++ b/libc/test/src/stdlib/StrfromTest.h
@@ -8,6 +8,7 @@
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/properties/architectures.h"
#include "test/UnitTest/ErrnoCheckingTest.h"
#include "test/UnitTest/ErrnoSetterMatcher.h"
#include "test/UnitTest/Test.h"
@@ -484,7 +485,9 @@ public:
ASSERT_STREQ_LEN(written, buff, "-NAN");
}
+ // https://github.com/llvm/llvm-project/issues/166795
void charsWrittenOverflow(FunctionT func) {
+#ifndef LIBC_TARGET_ARCH_IS_RISCV32
char buff[100];
// Trigger an overflow in the return value of strfrom by writing more than
// INT_MAX bytes.
@@ -492,6 +495,7 @@ public:
EXPECT_LT(result, 0);
ASSERT_ERRNO_FAILURE();
+#endif
}
};
diff --git a/libc/test/src/string/memchr_test.cpp b/libc/test/src/string/memchr_test.cpp
index ede8411..a92c5fe 100644
--- a/libc/test/src/string/memchr_test.cpp
+++ b/libc/test/src/string/memchr_test.cpp
@@ -21,6 +21,11 @@ const char *call_memchr(const void *src, int c, size_t size) {
return reinterpret_cast<const char *>(LIBC_NAMESPACE::memchr(src, c, size));
}
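+// The '$' sits 20 bytes in, so the word-sized block loop must run for several
+// iterations before the match is found near the end of the buffer.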
+TEST(LlvmLibcMemChrTest, WideReadMultiIteration) {
+ const char *src = "abcdefghijklmnopqrst$\n";
+ ASSERT_STREQ(call_memchr(src, '$', 22), "$\n");
+}
+
TEST(LlvmLibcMemChrTest, FindsCharacterAfterNullTerminator) {
// memchr should continue searching after a null terminator.
const size_t size = 5;
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 57032ce..46e17b5 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -262,6 +262,7 @@ set(files
__chrono/gps_clock.h
__chrono/hh_mm_ss.h
__chrono/high_resolution_clock.h
+ __chrono/is_clock.h
__chrono/leap_second.h
__chrono/literals.h
__chrono/local_info.h
diff --git a/libcxx/include/__chrono/is_clock.h b/libcxx/include/__chrono/is_clock.h
new file mode 100644
index 0000000..e63b848
--- /dev/null
+++ b/libcxx/include/__chrono/is_clock.h
@@ -0,0 +1,72 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___CHRONO_IS_CLOCK_H
+#define _LIBCPP___CHRONO_IS_CLOCK_H
+
+#include <__config>
+
+#include <__chrono/duration.h>
+#include <__chrono/time_point.h>
+#include <__concepts/same_as.h>
+#include <__type_traits/integral_constant.h>
+#include <__type_traits/is_arithmetic.h>
+#include <__type_traits/is_class.h>
+#include <__type_traits/is_union.h>
+#include <ratio>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+#if _LIBCPP_STD_VER >= 20
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace chrono {
+
+// Helper to check that _Tp::time_point has the form time_point<_, typename _Tp::duration>.
+template <class _TimePoint, class _ClockType>
+inline constexpr bool __is_valid_clock_time_point_v = false;
+
+template <class _TimePointClock, class _ClockType>
+inline constexpr bool
+ __is_valid_clock_time_point_v<time_point<_TimePointClock, typename _ClockType::duration>, _ClockType> = true;
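+
+// For example, time_point<_SomeOtherClock, typename _Tp::duration> is accepted:
+// only the duration is required to match ([time.clock.req]).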
+
+// Check if a clock satisfies the Cpp17Clock requirements as defined in [time.clock.req]
+template <class _Tp>
+_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_clock_v = requires {
+ typename _Tp::rep;
+ requires is_arithmetic_v<typename _Tp::rep> || is_class_v<typename _Tp::rep> || is_union_v<typename _Tp::rep>;
+
+ typename _Tp::period;
+ requires __is_ratio_v<typename _Tp::period>;
+
+ typename _Tp::duration;
+ requires same_as<typename _Tp::duration, duration<typename _Tp::rep, typename _Tp::period>>;
+
+ typename _Tp::time_point;
+ requires __is_valid_clock_time_point_v<typename _Tp::time_point, _Tp>;
+
+ _Tp::is_steady;
+ requires same_as<decltype((_Tp::is_steady)), const bool&>;
+
+ _Tp::now();
+ requires same_as<decltype(_Tp::now()), typename _Tp::time_point>;
+};
+
+template <class _Tp>
+struct _LIBCPP_NO_SPECIALIZATIONS is_clock : bool_constant<is_clock_v<_Tp>> {};
+
+} // namespace chrono
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 20
+#endif // _LIBCPP___CHRONO_IS_CLOCK_H
diff --git a/libcxx/include/chrono b/libcxx/include/chrono
index 82e99a3..aa4fc62 100644
--- a/libcxx/include/chrono
+++ b/libcxx/include/chrono
@@ -218,6 +218,9 @@ template <class ToDuration, class Rep, class Period>
template <class ToDuration, class Rep, class Period>
constexpr ToDuration round(const duration<Rep, Period>& d); // C++17
+template <class T> struct is_clock; // C++20
+template <class T> inline constexpr bool is_clock_v = is_clock<T>::value; // C++20
+
// duration I/O
template<class charT, class traits, class Rep, class Period> // C++20
basic_ostream<charT, traits>&
@@ -1057,6 +1060,7 @@ constexpr chrono::year operator ""y(unsigned lo
# include <__chrono/day.h>
# include <__chrono/exception.h>
# include <__chrono/hh_mm_ss.h>
+# include <__chrono/is_clock.h>
# include <__chrono/literals.h>
# include <__chrono/local_info.h>
# include <__chrono/month.h>
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 24a2fe7..f77c885 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -973,6 +973,10 @@ module std [system] {
header "__chrono/high_resolution_clock.h"
export *
}
+ module is_clock {
+ header "__chrono/is_clock.h"
+ export std_core.type_traits.integral_constant
+ }
module leap_second {
header "__chrono/leap_second.h"
}
diff --git a/libcxx/modules/std/chrono.inc b/libcxx/modules/std/chrono.inc
index 66eccd8..db405d48 100644
--- a/libcxx/modules/std/chrono.inc
+++ b/libcxx/modules/std/chrono.inc
@@ -25,8 +25,8 @@ export namespace std {
using std::chrono::duration_values;
- // using std::chrono::is_clock;
- // using std::chrono::is_clock_v;
+ using std::chrono::is_clock;
+ using std::chrono::is_clock_v;
// [time.duration.nonmember], duration arithmetic
using std::chrono::operator+;
diff --git a/libcxx/test/libcxx/time/time.traits.is.clock/trait.is.clock.compile.verify.cpp b/libcxx/test/libcxx/time/time.traits.is.clock/trait.is.clock.compile.verify.cpp
new file mode 100644
index 0000000..e9ad59a
--- /dev/null
+++ b/libcxx/test/libcxx/time/time.traits.is.clock/trait.is.clock.compile.verify.cpp
@@ -0,0 +1,24 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++20
+
+#include <chrono>
+#include <ratio>
+
+#if !__has_warning("-Winvalid-specializations")
+// expected-no-diagnostics
+#else
+
+template <>
+struct std::chrono::is_clock<int> : std::false_type {}; // expected-error@*:* {{'is_clock' cannot be specialized}}
+
+template <>
+constexpr bool std::chrono::is_clock_v<float> = false; // expected-error@*:* {{'is_clock_v' cannot be specialized}}
+
+#endif
diff --git a/libcxx/test/std/time/time.traits.is.clock/trait.is.clock.compile.pass.cpp b/libcxx/test/std/time/time.traits.is.clock/trait.is.clock.compile.pass.cpp
new file mode 100644
index 0000000..4168fa7
--- /dev/null
+++ b/libcxx/test/std/time/time.traits.is.clock/trait.is.clock.compile.pass.cpp
@@ -0,0 +1,225 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++20
+
+#include <chrono>
+#include <ratio>
+
+#include "test_macros.h"
+
+struct EmptyStruct {};
+
+// Test structs missing required members
+struct MissingRep {
+ using period = std::ratio<1>;
+ using duration = std::chrono::seconds;
+ using time_point = std::chrono::time_point<MissingRep>;
+ static constexpr bool is_steady = false;
+ static time_point now();
+};
+
+struct MissingPeriod {
+ using rep = long;
+ using duration = std::chrono::seconds;
+ using time_point = std::chrono::time_point<MissingPeriod>;
+ static constexpr bool is_steady = false;
+ static time_point now();
+};
+
+struct MissingDuration {
+ using rep = long;
+ using time_point = long;
+ static constexpr bool is_steady = false;
+ static time_point now();
+};
+
+struct MissingTimePoint {
+ using rep = long;
+ using period = std::ratio<1>;
+ using duration = std::chrono::seconds;
+ static constexpr bool is_steady = false;
+ static std::chrono::time_point<MissingTimePoint> now();
+};
+
+struct MissingIsSteady {
+ using rep = long;
+ using period = std::ratio<1>;
+ using duration = std::chrono::seconds;
+ using time_point = std::chrono::time_point<MissingIsSteady>;
+ static time_point now();
+};
+
+struct MissingNow {
+ using rep = long;
+ using period = std::ratio<1>;
+ using duration = std::chrono::seconds;
+ using time_point = std::chrono::time_point<MissingNow>;
+ static constexpr bool is_steady = false;
+};
+
+// Valid clock types
+struct ValidSteadyClock {
+ using rep = long long;
+ using period = std::nano;
+ using duration = std::chrono::nanoseconds;
+ using time_point = std::chrono::time_point<ValidSteadyClock>;
+ static constexpr bool is_steady = true;
+ static time_point now();
+};
+
+struct ValidSystemClock {
+ using rep = long long;
+ using period = std::micro;
+ using duration = std::chrono::microseconds;
+ using time_point = std::chrono::time_point<ValidSystemClock>;
+ static constexpr bool is_steady = false;
+ static time_point now();
+};
+
+// Test clocks with invalid is_steady type
+struct WrongIsSteadyType {
+ using rep = long;
+ using period = std::ratio<1>;
+ using duration = std::chrono::seconds;
+ using time_point = std::chrono::time_point<WrongIsSteadyType>;
+ static bool is_steady; // Not const bool
+ static time_point now();
+};
+
+struct WrongIsSteadyNonBool {
+ using rep = long;
+ using period = std::ratio<1>;
+ using duration = std::chrono::seconds;
+ using time_point = std::chrono::time_point<WrongIsSteadyNonBool>;
+ static constexpr int is_steady = 1; // Not bool
+ static time_point now();
+};
+
+// Test clocks with invalid now() return type
+struct WrongNowReturnType {
+ using rep = long;
+ using period = std::ratio<1>;
+ using duration = std::chrono::seconds;
+ using time_point = std::chrono::time_point<WrongNowReturnType>;
+ static constexpr bool is_steady = false;
+ static int now(); // Wrong return type
+};
+
+// Test clocks with invalid period type
+struct WrongPeriodType {
+ using rep = long;
+ using period = int; // Not a ratio
+ using duration = std::chrono::seconds;
+ using time_point = std::chrono::time_point<WrongPeriodType>;
+ static constexpr bool is_steady = false;
+ static time_point now();
+};
+
+// Test clocks with wrong duration type
+struct WrongDurationType {
+ using rep = long;
+ using period = std::ratio<1>;
+ using duration = std::chrono::milliseconds; // Should be duration<long, ratio<1>>
+ using time_point = std::chrono::time_point<WrongDurationType>;
+ static constexpr bool is_steady = false;
+ static time_point now();
+};
+
+// Test clocks with wrong time_point type
+struct WrongTimePointType {
+ using rep = long;
+ using period = std::ratio<1>;
+ using duration = std::chrono::duration<long, std::ratio<1>>;
+ using time_point = int; // Not a time_point
+ static constexpr bool is_steady = false;
+ static time_point now();
+};
+
+struct WrongTimePointClock {
+ using rep = long;
+ using period = std::ratio<1>;
+ using duration = std::chrono::duration<long, std::ratio<1>>;
+ using time_point = std::chrono::time_point<ValidSystemClock>; // Wrong clock type
+ static constexpr bool is_steady = false;
+ static time_point now();
+};
+
+// Valid clock with time_point that has matching duration instead of matching clock
+struct ValidClockWithDurationMatch {
+ using rep = int;
+ using period = std::milli;
+ using duration = std::chrono::duration<int, std::milli>;
+ using time_point = std::chrono::time_point<ValidSystemClock, duration>; // Valid: matches duration
+ static constexpr bool is_steady = false;
+ static time_point now();
+};
+
+// Test both is_clock and is_clock_v
+static_assert(std::chrono::is_clock<std::chrono::system_clock>::value);
+static_assert(std::chrono::is_clock_v<std::chrono::system_clock>);
+
+// Test standard clock types
+static_assert(std::chrono::is_clock_v<std::chrono::system_clock>);
+static_assert(std::chrono::is_clock_v<std::chrono::high_resolution_clock>);
+
+// Test non-clock types
+static_assert(!std::chrono::is_clock_v<EmptyStruct>);
+static_assert(!std::chrono::is_clock_v<int>);
+static_assert(!std::chrono::is_clock_v<void>);
+static_assert(!std::chrono::is_clock_v<std::chrono::system_clock::time_point>);
+static_assert(!std::chrono::is_clock_v<std::chrono::seconds>);
+static_assert(!std::chrono::is_clock_v<std::chrono::milliseconds>);
+
+// Test structs missing required members
+static_assert(!std::chrono::is_clock_v<MissingRep>);
+static_assert(!std::chrono::is_clock_v<MissingPeriod>);
+static_assert(!std::chrono::is_clock_v<MissingDuration>);
+static_assert(!std::chrono::is_clock_v<MissingTimePoint>);
+static_assert(!std::chrono::is_clock_v<MissingIsSteady>);
+static_assert(!std::chrono::is_clock_v<MissingNow>);
+
+// Test valid custom clocks
+static_assert(std::chrono::is_clock_v<ValidSteadyClock>);
+static_assert(std::chrono::is_clock_v<ValidSystemClock>);
+static_assert(std::chrono::is_clock_v<ValidClockWithDurationMatch>);
+
+// cv-qualified and reference types
+static_assert(std::chrono::is_clock_v<const std::chrono::system_clock>);
+static_assert(std::chrono::is_clock_v<volatile std::chrono::system_clock>);
+static_assert(std::chrono::is_clock_v<const volatile std::chrono::system_clock>);
+static_assert(!std::chrono::is_clock_v<std::chrono::system_clock&>);
+static_assert(!std::chrono::is_clock_v<std::chrono::system_clock&&>);
+static_assert(!std::chrono::is_clock_v<const std::chrono::system_clock&>);
+
+// array and pointer types
+static_assert(!std::chrono::is_clock_v<std::chrono::system_clock[]>);
+static_assert(!std::chrono::is_clock_v<std::chrono::system_clock[10]>);
+static_assert(!std::chrono::is_clock_v<std::chrono::system_clock*>);
+static_assert(!std::chrono::is_clock_v<std::chrono::system_clock* const>);
+
+// The Standard defines a minimum set of checks and allows implementations to perform stricter checks. The following
+// static_asserts are implementation-specific, and a conforming standard library implementation doesn't have to
+// produce the same outcome.
+
+// Test clocks with invalid is_steady type
+LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongIsSteadyType>); // is_steady not const bool
+LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongIsSteadyNonBool>); // is_steady not bool type
+
+// Test clocks with invalid now() return type
+LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongNowReturnType>); // now() doesn't return time_point
+
+// Test clocks with invalid period type
+LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongPeriodType>); // period is not a ratio
+
+// Test clocks with wrong duration type
+LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongDurationType>); // duration doesn't match duration<rep, period>
+
+// Test clocks with wrong time_point type
+LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongTimePointType>); // time_point is not a time_point
+LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongTimePointClock>); // time_point has wrong clock and wrong duration
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index a4150eb..9a70c0d 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -54,8 +54,6 @@ using llvm::support::endian::read32le;
using llvm::support::endian::write32le;
using llvm::support::endian::write64le;
-constexpr size_t MergeNoTailSection::numShards;
-
static uint64_t readUint(Ctx &ctx, uint8_t *buf) {
return ctx.arg.is64 ? read64(ctx, buf) : read32(ctx, buf);
}
diff --git a/lldb/bindings/python/CMakeLists.txt b/lldb/bindings/python/CMakeLists.txt
index ef6def3..28a8af8 100644
--- a/lldb/bindings/python/CMakeLists.txt
+++ b/lldb/bindings/python/CMakeLists.txt
@@ -107,6 +107,7 @@ function(finish_swig_python swig_target lldb_python_bindings_dir lldb_python_tar
"plugins"
FILES
"${LLDB_SOURCE_DIR}/examples/python/templates/parsed_cmd.py"
+ "${LLDB_SOURCE_DIR}/examples/python/templates/scripted_frame_provider.py"
"${LLDB_SOURCE_DIR}/examples/python/templates/scripted_process.py"
"${LLDB_SOURCE_DIR}/examples/python/templates/scripted_platform.py"
"${LLDB_SOURCE_DIR}/examples/python/templates/operating_system.py"
diff --git a/lldb/bindings/python/python-swigsafecast.swig b/lldb/bindings/python/python-swigsafecast.swig
index 3ea24f1..a86dc44 100644
--- a/lldb/bindings/python/python-swigsafecast.swig
+++ b/lldb/bindings/python/python-swigsafecast.swig
@@ -37,6 +37,11 @@ PythonObject SWIGBridge::ToSWIGWrapper(lldb::ThreadPlanSP thread_plan_sp) {
SWIGTYPE_p_lldb__SBThreadPlan);
}
+PythonObject SWIGBridge::ToSWIGWrapper(lldb::StackFrameListSP frames_sp) {
+ return ToSWIGHelper(new lldb::SBFrameList(std::move(frames_sp)),
+ SWIGTYPE_p_lldb__SBFrameList);
+}
+
PythonObject SWIGBridge::ToSWIGWrapper(lldb::BreakpointSP breakpoint_sp) {
return ToSWIGHelper(new lldb::SBBreakpoint(std::move(breakpoint_sp)),
SWIGTYPE_p_lldb__SBBreakpoint);
diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig
index e7acba5..3a0995e 100644
--- a/lldb/bindings/python/python-wrapper.swig
+++ b/lldb/bindings/python/python-wrapper.swig
@@ -556,6 +556,18 @@ void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBExecutionContext(PyOb
return sb_ptr;
}
+void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrameList(PyObject *data) {
+ lldb::SBFrameList *sb_ptr = NULL;
+
+ int valid_cast = SWIG_ConvertPtr(data, (void **)&sb_ptr,
+ SWIGTYPE_p_lldb__SBFrameList, 0);
+
+ if (valid_cast == -1)
+ return NULL;
+
+ return sb_ptr;
+}
+
bool lldb_private::python::SWIGBridge::LLDBSwigPythonCallCommand(
const char *python_function_name, const char *session_dictionary_name,
lldb::DebuggerSP debugger, const char *args,
diff --git a/lldb/examples/python/templates/scripted_frame_provider.py b/lldb/examples/python/templates/scripted_frame_provider.py
new file mode 100644
index 0000000..20f4d76
--- /dev/null
+++ b/lldb/examples/python/templates/scripted_frame_provider.py
@@ -0,0 +1,113 @@
+from abc import ABCMeta, abstractmethod
+
+import lldb
+
+
+class ScriptedFrameProvider(metaclass=ABCMeta):
+ """
+ The base class for a scripted frame provider.
+
+ A scripted frame provider allows you to provide custom stack frames for a
+ thread, which can be used to augment or replace the standard unwinding
+ mechanism. This is useful for:
+
+ - Providing frames for custom calling conventions or languages
+ - Reconstructing missing frames from crash dumps or core files
+ - Adding diagnostic or synthetic frames for debugging
+ - Visualizing state machines or async execution contexts
+
+ Most of the base class methods are marked `@abstractmethod` and need to
+ be overridden by the inheriting class.
+
+ Example usage:
+
+ .. code-block:: python
+
+ # Attach a frame provider to a thread
+ thread = process.GetSelectedThread()
+ error = thread.SetScriptedFrameProvider(
+ "my_module.MyFrameProvider",
+ lldb.SBStructuredData()
+ )
+ """
+
+ @abstractmethod
+ def __init__(self, input_frames, args):
+ """Construct a scripted frame provider.
+
+ Args:
+ input_frames (lldb.SBFrameList): The frame list to use as input.
+ This allows you to access frames by index. The frames are
+ materialized lazily as you access them.
+ args (lldb.SBStructuredData): A Dictionary holding arbitrary
+ key/value pairs used by the scripted frame provider.
+ """
+ self.input_frames = None
+ self.args = None
+ self.thread = None
+ self.target = None
+ self.process = None
+
+ if isinstance(input_frames, lldb.SBFrameList) and input_frames.IsValid():
+ self.input_frames = input_frames
+ self.thread = input_frames.GetThread()
+ if self.thread and self.thread.IsValid():
+ self.process = self.thread.GetProcess()
+ if self.process and self.process.IsValid():
+ self.target = self.process.GetTarget()
+
+ if isinstance(args, lldb.SBStructuredData) and args.IsValid():
+ self.args = args
+
+ @abstractmethod
+ def get_frame_at_index(self, index):
+ """Get a single stack frame at the given index.
+
+ This method is called lazily when a specific frame is needed in the
+ thread's backtrace (e.g., via the 'bt' command). Each frame is
+ requested individually as needed.
+
+ Args:
+ index (int): The frame index to retrieve (0 for youngest/top frame).
+
+ Returns:
+ Dict or None: A frame dictionary describing the stack frame, or None
+ if no frame exists at this index. The dictionary should contain:
+
+ Required fields:
+ - idx (int): The synthetic frame index (0 for youngest/top frame)
+ - pc (int): The program counter address for the synthetic frame
+
+ Alternatively, you can return:
+ - A ScriptedFrame object for full control over frame behavior
+ - An integer representing an input frame index to reuse
+ - None to indicate no more frames exist
+
+ Example:
+
+ .. code-block:: python
+
+ def get_frame_at_index(self, index):
+ # Return None when there are no more frames
+ if index >= self.total_frames:
+ return None
+
+ # Re-use an input frame by returning its index
+ if self.should_use_input_frame(index):
+ return index # Returns input frame at this index
+
+ # Or create a custom frame dictionary
+ if index == 0:
+ return {
+ "idx": 0,
+ "pc": 0x100001234,
+ }
+
+ return None
+
+ Note:
+ The frames are indexed from 0 (youngest/top) to N (oldest/bottom).
+ This method will be called repeatedly with increasing indices until
+ None is returned.
+ """
+ pass
diff --git a/lldb/include/lldb/API/SBFrameList.h b/lldb/include/lldb/API/SBFrameList.h
index dba1c1d..0039ffb 100644
--- a/lldb/include/lldb/API/SBFrameList.h
+++ b/lldb/include/lldb/API/SBFrameList.h
@@ -11,6 +11,16 @@
#include "lldb/API/SBDefines.h"
+namespace lldb_private {
+class ScriptInterpreter;
+namespace python {
+class SWIGBridge;
+}
+namespace lua {
+class SWIGBridge;
+}
+} // namespace lldb_private
+
namespace lldb {
/// Represents a list of SBFrame objects.
@@ -66,6 +76,10 @@ public:
protected:
friend class SBThread;
+ friend class lldb_private::python::SWIGBridge;
+ friend class lldb_private::lua::SWIGBridge;
+ friend class lldb_private::ScriptInterpreter;
+
private:
SBFrameList(const lldb::StackFrameListSP &frame_list_sp);
diff --git a/lldb/include/lldb/API/SBModuleSpec.h b/lldb/include/lldb/API/SBModuleSpec.h
index 8d1ecfe..b80a52b 100644
--- a/lldb/include/lldb/API/SBModuleSpec.h
+++ b/lldb/include/lldb/API/SBModuleSpec.h
@@ -87,6 +87,16 @@ public:
bool GetDescription(lldb::SBStream &description);
+ lldb::SBTarget GetTarget();
+
+ /// Set the target to be used when resolving a module.
+ ///
+ /// A target can help locate a module specified by an SBModuleSpec. The
+ /// target settings, like the executable and debug info search paths, can
+ /// be essential. The target's platform can also be used to locate or download
+ /// the specified module.
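+ ///
+ /// A hypothetical usage sketch:
+ /// \code
+ ///   lldb::SBModuleSpec module_spec;
+ ///   module_spec.SetFileSpec(file_spec);
+ ///   module_spec.SetTarget(target); // let the target's settings guide resolution
+ /// \endcode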
+ void SetTarget(lldb::SBTarget target);
+
private:
friend class SBModuleSpecList;
friend class SBModule;
diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h
index 173fd05..379a0bb 100644
--- a/lldb/include/lldb/API/SBTarget.h
+++ b/lldb/include/lldb/API/SBTarget.h
@@ -999,6 +999,7 @@ protected:
friend class SBFunction;
friend class SBInstruction;
friend class SBModule;
+ friend class SBModuleSpec;
friend class SBPlatform;
friend class SBProcess;
friend class SBSection;
diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h
index e71f3b2..df473df 100644
--- a/lldb/include/lldb/Core/ModuleList.h
+++ b/lldb/include/lldb/Core/ModuleList.h
@@ -476,9 +476,9 @@ public:
static Status
GetSharedModule(const ModuleSpec &module_spec, lldb::ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules,
- bool *did_create_ptr, bool always_create = false);
+ bool *did_create_ptr, bool always_create = false,
+ bool invoke_locate_callback = true);
static bool RemoveSharedModule(lldb::ModuleSP &module_sp);
diff --git a/lldb/include/lldb/Core/ModuleSpec.h b/lldb/include/lldb/Core/ModuleSpec.h
index 86be038..acbc85b 100644
--- a/lldb/include/lldb/Core/ModuleSpec.h
+++ b/lldb/include/lldb/Core/ModuleSpec.h
@@ -16,9 +16,11 @@
#include "lldb/Utility/Iterable.h"
#include "lldb/Utility/Stream.h"
#include "lldb/Utility/UUID.h"
+#include "lldb/lldb-forward.h"
#include "llvm/Support/Chrono.h"
+#include <memory>
#include <mutex>
#include <vector>
@@ -126,6 +128,16 @@ public:
lldb::DataBufferSP GetData() const { return m_data; }
+ lldb::TargetSP GetTargetSP() const { return m_target_wp.lock(); }
+
+ /// Set the target to be used when resolving a module.
+ ///
+ /// A target can help locate a module specified by a ModuleSpec. The target
+ /// settings, like the executable and debug info search paths, can be
+ /// essential. The target's platform can also be used to locate or download
+ /// the specified module.
+ void SetTarget(std::shared_ptr<Target> target) { m_target_wp = target; }
+
void Clear() {
m_file.Clear();
m_platform_file.Clear();
@@ -137,6 +149,7 @@ public:
m_object_size = 0;
m_source_mappings.Clear(false);
m_object_mod_time = llvm::sys::TimePoint<>();
+ m_target_wp.reset();
}
explicit operator bool() const {
@@ -265,6 +278,11 @@ protected:
ArchSpec m_arch;
UUID m_uuid;
ConstString m_object_name;
+ /// The target used when resolving a module. A target can help locate a module
+ /// specified by a ModuleSpec. The target settings, like the executable and
+ /// debug info search paths, can be essential. The target's platform can also
+ /// be used to locate or download the specified module.
+ std::weak_ptr<Target> m_target_wp;
uint64_t m_object_offset = 0;
uint64_t m_object_size = 0;
llvm::sys::TimePoint<> m_object_mod_time;
diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h
index aa60b7c..ab2ca58 100644
--- a/lldb/include/lldb/Core/PluginManager.h
+++ b/lldb/include/lldb/Core/PluginManager.h
@@ -356,6 +356,24 @@ public:
GetScriptInterpreterForLanguage(lldb::ScriptLanguage script_lang,
Debugger &debugger);
+ // SyntheticFrameProvider
+ static bool
+ RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
+ SyntheticFrameProviderCreateInstance create_native_callback,
+ ScriptedFrameProviderCreateInstance create_scripted_callback);
+
+ static bool
+ UnregisterPlugin(SyntheticFrameProviderCreateInstance create_callback);
+
+ static bool
+ UnregisterPlugin(ScriptedFrameProviderCreateInstance create_callback);
+
+ static SyntheticFrameProviderCreateInstance
+ GetSyntheticFrameProviderCreateCallbackForPluginName(llvm::StringRef name);
+
+ static ScriptedFrameProviderCreateInstance
+ GetScriptedFrameProviderCreateCallbackAtIndex(uint32_t idx);
+
// StructuredDataPlugin
/// Register a StructuredDataPlugin class along with optional
diff --git a/lldb/include/lldb/Core/Section.h b/lldb/include/lldb/Core/Section.h
index f0f5a0b..d0f10cc 100644
--- a/lldb/include/lldb/Core/Section.h
+++ b/lldb/include/lldb/Core/Section.h
@@ -273,6 +273,9 @@ public:
/// return true.
bool ContainsOnlyDebugInfo() const;
+ /// Returns true if this is a global offset table section.
+ bool IsGOTSection() const;
+
protected:
ObjectFile *m_obj_file; // The object file that data for this section should
// be read from
diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h
new file mode 100644
index 0000000..2d9f713
--- /dev/null
+++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_INTERPRETER_INTERFACES_SCRIPTEDFRAMEPROVIDERINTERFACE_H
+#define LLDB_INTERPRETER_INTERFACES_SCRIPTEDFRAMEPROVIDERINTERFACE_H
+
+#include "lldb/lldb-private.h"
+
+#include "ScriptedInterface.h"
+
+namespace lldb_private {
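+/// Interface used by script interpreters to create and drive scripted frame
+/// providers (e.g. the Python ScriptedFrameProvider template).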
+class ScriptedFrameProviderInterface : public ScriptedInterface {
+public:
+ virtual llvm::Expected<StructuredData::GenericSP>
+ CreatePluginObject(llvm::StringRef class_name,
+ lldb::StackFrameListSP input_frames,
+ StructuredData::DictionarySP args_sp) = 0;
+
+ virtual StructuredData::ObjectSP GetFrameAtIndex(uint32_t index) {
+ return {};
+ }
+};
+} // namespace lldb_private
+
+#endif // LLDB_INTERPRETER_INTERFACES_SCRIPTEDFRAMEPROVIDERINTERFACE_H
diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h
index edb80dc..7fed494 100644
--- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h
+++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h
@@ -16,6 +16,7 @@
#include "lldb/API/SBError.h"
#include "lldb/API/SBEvent.h"
#include "lldb/API/SBExecutionContext.h"
+#include "lldb/API/SBFrameList.h"
#include "lldb/API/SBLaunchInfo.h"
#include "lldb/API/SBMemoryRegionInfo.h"
#include "lldb/API/SBStream.h"
@@ -28,6 +29,7 @@
#include "lldb/Host/StreamFile.h"
#include "lldb/Interpreter/Interfaces/OperatingSystemInterface.h"
#include "lldb/Interpreter/Interfaces/ScriptedFrameInterface.h"
+#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h"
#include "lldb/Interpreter/Interfaces/ScriptedPlatformInterface.h"
#include "lldb/Interpreter/Interfaces/ScriptedProcessInterface.h"
#include "lldb/Interpreter/Interfaces/ScriptedThreadInterface.h"
@@ -537,6 +539,11 @@ public:
return {};
}
+ virtual lldb::ScriptedFrameProviderInterfaceSP
+ CreateScriptedFrameProviderInterface() {
+ return {};
+ }
+
virtual lldb::ScriptedThreadPlanInterfaceSP
CreateScriptedThreadPlanInterface() {
return {};
@@ -596,6 +603,9 @@ public:
lldb::ExecutionContextRefSP GetOpaqueTypeFromSBExecutionContext(
const lldb::SBExecutionContext &exe_ctx) const;
+ lldb::StackFrameListSP
+ GetOpaqueTypeFromSBFrameList(const lldb::SBFrameList &frame_list) const;
+
protected:
Debugger &m_debugger;
lldb::ScriptLanguage m_script_lang;
diff --git a/lldb/include/lldb/Symbol/ObjectFile.h b/lldb/include/lldb/Symbol/ObjectFile.h
index 1b9ae1f..1de08a8 100644
--- a/lldb/include/lldb/Symbol/ObjectFile.h
+++ b/lldb/include/lldb/Symbol/ObjectFile.h
@@ -758,6 +758,12 @@ public:
return false;
}
+ /// Returns true if the section is a global offset table section.
+ virtual bool IsGOTSection(const lldb_private::Section &section) const {
+ assert(section.GetObjectFile() == this && "Wrong object file!");
+ return false;
+ }
+
/// Get a hash that can be used for caching object file related information.
///
/// Data for object files can be cached between runs of debug sessions and
diff --git a/lldb/include/lldb/Target/Platform.h b/lldb/include/lldb/Target/Platform.h
index 35ffdab..1104722 100644
--- a/lldb/include/lldb/Target/Platform.h
+++ b/lldb/include/lldb/Target/Platform.h
@@ -127,8 +127,7 @@ public:
/// Returns \b true if this Platform plug-in was able to find
/// a suitable executable, \b false otherwise.
virtual Status ResolveExecutable(const ModuleSpec &module_spec,
- lldb::ModuleSP &exe_module_sp,
- const FileSpecList *module_search_paths_ptr);
+ lldb::ModuleSP &exe_module_sp);
/// Find a symbol file given a symbol file module specification.
///
@@ -304,10 +303,11 @@ public:
/// \return
/// The Status object for any errors found while searching for
/// the binary.
- virtual Status GetSharedModule(
- const ModuleSpec &module_spec, Process *process,
- lldb::ModuleSP &module_sp, const FileSpecList *module_search_paths_ptr,
- llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr);
+ virtual Status
+ GetSharedModule(const ModuleSpec &module_spec, Process *process,
+ lldb::ModuleSP &module_sp,
+ llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules,
+ bool *did_create_ptr);
void CallLocateModuleCallbackIfSet(const ModuleSpec &module_spec,
lldb::ModuleSP &module_sp,
@@ -1039,8 +1039,8 @@ protected:
/// predefined trap handlers, this method may be a no-op.
virtual void CalculateTrapHandlerSymbolNames() = 0;
- Status GetCachedExecutable(ModuleSpec &module_spec, lldb::ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr);
+ Status GetCachedExecutable(ModuleSpec &module_spec,
+ lldb::ModuleSP &module_sp);
virtual Status DownloadModuleSlice(const FileSpec &src_file_spec,
const uint64_t src_offset,
diff --git a/lldb/include/lldb/Target/RemoteAwarePlatform.h b/lldb/include/lldb/Target/RemoteAwarePlatform.h
index fb2eecf..de13b18 100644
--- a/lldb/include/lldb/Target/RemoteAwarePlatform.h
+++ b/lldb/include/lldb/Target/RemoteAwarePlatform.h
@@ -20,10 +20,8 @@ class RemoteAwarePlatform : public Platform {
public:
using Platform::Platform;
- virtual Status
- ResolveExecutable(const ModuleSpec &module_spec,
- lldb::ModuleSP &exe_module_sp,
- const FileSpecList *module_search_paths_ptr) override;
+ virtual Status ResolveExecutable(const ModuleSpec &module_spec,
+ lldb::ModuleSP &exe_module_sp) override;
bool GetModuleSpec(const FileSpec &module_file_spec, const ArchSpec &arch,
ModuleSpec &module_spec) override;
diff --git a/lldb/include/lldb/Target/SyntheticFrameProvider.h b/lldb/include/lldb/Target/SyntheticFrameProvider.h
new file mode 100644
index 0000000..61a492f
--- /dev/null
+++ b/lldb/include/lldb/Target/SyntheticFrameProvider.h
@@ -0,0 +1,156 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_TARGET_SYNTHETICFRAMEPROVIDER_H
+#define LLDB_TARGET_SYNTHETICFRAMEPROVIDER_H
+
+#include "lldb/Core/PluginInterface.h"
+#include "lldb/Target/StackFrameList.h"
+#include "lldb/Target/ThreadSpec.h"
+#include "lldb/Utility/ScriptedMetadata.h"
+#include "lldb/Utility/Status.h"
+#include "lldb/lldb-forward.h"
+#include "llvm/Support/Error.h"
+
+#include <optional>
+#include <vector>
+
+namespace lldb_private {
+
+/// This struct contains the metadata needed to instantiate a frame provider
+/// and optional filters to control which threads it applies to.
+struct SyntheticFrameProviderDescriptor {
+ /// Metadata for instantiating the provider (e.g. script class name and args).
+ lldb::ScriptedMetadataSP scripted_metadata_sp;
+
+ /// Optional list of thread specifications to which this provider applies.
+ /// If empty, the provider applies to all threads. A thread matches if it
+ /// satisfies ANY of the specs in this vector (OR logic).
+ std::vector<ThreadSpec> thread_specs;
+
+ SyntheticFrameProviderDescriptor() = default;
+
+ SyntheticFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp)
+ : scripted_metadata_sp(metadata_sp) {}
+
+ SyntheticFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp,
+ const std::vector<ThreadSpec> &specs)
+ : scripted_metadata_sp(metadata_sp), thread_specs(specs) {}
+
+ /// Get the name of this descriptor (the scripted class name).
+ llvm::StringRef GetName() const {
+ return scripted_metadata_sp ? scripted_metadata_sp->GetClassName() : "";
+ }
+
+ /// Check if this descriptor applies to the given thread.
+ bool AppliesToThread(Thread &thread) const {
+    // If no thread specs are specified, the provider applies to all threads.
+ if (thread_specs.empty())
+ return true;
+
+ // Check if the thread matches any of the specs (OR logic).
+ for (const auto &spec : thread_specs) {
+ if (spec.ThreadPassesBasicTests(thread))
+ return true;
+ }
+ return false;
+ }
+
+ /// Check if this descriptor has valid metadata for script-based providers.
+ bool IsValid() const { return scripted_metadata_sp != nullptr; }
+
+ void Dump(Stream *s) const;
+};
+
+/// Base class for all synthetic frame providers.
+///
+/// Synthetic frame providers allow modifying or replacing the stack frames
+/// shown for a thread. This is useful for:
+/// - Providing frames for custom calling conventions or languages.
+/// - Reconstructing missing frames from crash dumps or core files.
+/// - Adding diagnostic or synthetic frames for debugging.
+/// - Visualizing state machines or async execution contexts.
+class SyntheticFrameProvider : public PluginInterface {
+public:
+ /// Try to create a SyntheticFrameProvider instance for the given input
+ /// frames and descriptor.
+ ///
+ /// This method iterates through all registered SyntheticFrameProvider
+ /// plugins and returns the first one that can handle the given descriptor.
+ ///
+ /// \param[in] input_frames
+ /// The input stack frame list that this provider will transform.
+ /// This could be real unwound frames or output from another provider.
+ ///
+ /// \param[in] descriptor
+ /// The descriptor containing metadata for the provider.
+ ///
+ /// \return
+ /// A shared pointer to a SyntheticFrameProvider if one could be created,
+ /// otherwise an \a llvm::Error.
+ static llvm::Expected<lldb::SyntheticFrameProviderSP>
+ CreateInstance(lldb::StackFrameListSP input_frames,
+ const SyntheticFrameProviderDescriptor &descriptor);
+
+ /// Try to create a SyntheticFrameProvider instance for the given input
+ /// frames using a specific C++ plugin.
+ ///
+ /// This method directly invokes a specific SyntheticFrameProvider plugin
+ /// by name, bypassing the descriptor-based plugin iteration. This is useful
+ /// for C++ plugins that don't require scripted metadata.
+ ///
+ /// \param[in] input_frames
+ /// The input stack frame list that this provider will transform.
+ /// This could be real unwound frames or output from another provider.
+ ///
+ /// \param[in] plugin_name
+ /// The name of the plugin to use for creating the provider.
+ ///
+ /// \param[in] thread_specs
+ /// Optional list of thread specifications to which this provider applies.
+ /// If empty, the provider applies to all threads.
+ ///
+ /// \return
+ /// A shared pointer to a SyntheticFrameProvider if one could be created,
+ /// otherwise an \a llvm::Error.
+ static llvm::Expected<lldb::SyntheticFrameProviderSP>
+ CreateInstance(lldb::StackFrameListSP input_frames,
+ llvm::StringRef plugin_name,
+ const std::vector<ThreadSpec> &thread_specs = {});
+
+ ~SyntheticFrameProvider() override;
+
+ /// Get a single stack frame at the specified index.
+ ///
+  /// This method is called lazily: frames are only created when requested.
+ /// The provider can access its input frames via GetInputFrames() if needed.
+ ///
+ /// \param[in] idx
+ /// The index of the frame to create.
+ ///
+ /// \return
+ /// An Expected containing the StackFrameSP if successful. Returns an
+ /// error when the index is beyond the last frame to signal the end of
+ /// the frame list.
+ virtual llvm::Expected<lldb::StackFrameSP> GetFrameAtIndex(uint32_t idx) = 0;
+
+ /// Get the thread associated with this provider.
+ Thread &GetThread() { return m_input_frames->GetThread(); }
+
+ /// Get the input frames that this provider transforms.
+ lldb::StackFrameListSP GetInputFrames() const { return m_input_frames; }
+
+protected:
+ SyntheticFrameProvider(lldb::StackFrameListSP input_frames);
+
+ lldb::StackFrameListSP m_input_frames;
+};
+
+} // namespace lldb_private
+
+#endif // LLDB_TARGET_SYNTHETICFRAMEPROVIDER_H
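As a rough usage sketch (not part of the patch), a client holding a Thread and its StackFrameListSP might build a descriptor and walk the provider's frames as below. The class name "my_module.MyFrameProvider" is made up, and the sketch assumes ScriptedMetadata's (class name, args) constructor and ThreadSpec's index-based matching.

    // Sketch under stated assumptions: Thread &thread and its input
    // StackFrameListSP are already in hand.
    #include "lldb/Target/SyntheticFrameProvider.h"
    #include "lldb/Utility/ScriptedMetadata.h"

    using namespace lldb_private;

    static llvm::Error WalkSyntheticFrames(Thread &thread,
                                           lldb::StackFrameListSP input_frames) {
      ThreadSpec spec;
      spec.SetIndex(1); // Only apply to the thread at index 1.

      SyntheticFrameProviderDescriptor desc(
          std::make_shared<ScriptedMetadata>("my_module.MyFrameProvider",
                                             /*args_sp=*/nullptr),
          {spec});

      if (!desc.AppliesToThread(thread))
        return llvm::Error::success();

      llvm::Expected<lldb::SyntheticFrameProviderSP> provider =
          SyntheticFrameProvider::CreateInstance(input_frames, desc);
      if (!provider)
        return provider.takeError();

      for (uint32_t idx = 0;; ++idx) {
        llvm::Expected<lldb::StackFrameSP> frame =
            (*provider)->GetFrameAtIndex(idx);
        if (!frame) {
          // Per the doxygen above, an error signals the end of the list.
          llvm::consumeError(frame.takeError());
          break;
        }
        // ... consume *frame ...
      }
      return llvm::Error::success();
    }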
diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h
index af5656b..8b8d081 100644
--- a/lldb/include/lldb/lldb-forward.h
+++ b/lldb/include/lldb/lldb-forward.h
@@ -188,6 +188,7 @@ class Scalar;
class ScriptInterpreter;
class ScriptInterpreterLocker;
class ScriptedFrameInterface;
+class ScriptedFrameProviderInterface;
class ScriptedMetadata;
class ScriptedBreakpointInterface;
class ScriptedPlatformInterface;
@@ -235,6 +236,7 @@ class SymbolVendor;
class Symtab;
class SyntheticChildren;
class SyntheticChildrenFrontEnd;
+class SyntheticFrameProvider;
class SystemRuntime;
class Progress;
class Target;
@@ -411,6 +413,10 @@ typedef std::shared_ptr<lldb_private::ScriptSummaryFormat>
typedef std::shared_ptr<lldb_private::ScriptInterpreter> ScriptInterpreterSP;
typedef std::shared_ptr<lldb_private::ScriptedFrameInterface>
ScriptedFrameInterfaceSP;
+typedef std::shared_ptr<lldb_private::ScriptedFrameProviderInterface>
+ ScriptedFrameProviderInterfaceSP;
+typedef std::shared_ptr<lldb_private::SyntheticFrameProvider>
+ SyntheticFrameProviderSP;
typedef std::shared_ptr<lldb_private::ScriptedMetadata> ScriptedMetadataSP;
typedef std::unique_ptr<lldb_private::ScriptedPlatformInterface>
ScriptedPlatformInterfaceUP;
diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h
index 249b25c..2fe3af7 100644
--- a/lldb/include/lldb/lldb-private-interfaces.h
+++ b/lldb/include/lldb/lldb-private-interfaces.h
@@ -25,6 +25,7 @@ class Value;
namespace lldb_private {
class ScriptedInterfaceUsages;
+struct SyntheticFrameProviderDescriptor;
typedef lldb::ABISP (*ABICreateInstance)(lldb::ProcessSP process_sp,
const ArchSpec &arch);
typedef std::unique_ptr<Architecture> (*ArchitectureCreateInstance)(
@@ -86,6 +87,14 @@ typedef lldb::RegisterTypeBuilderSP (*RegisterTypeBuilderCreateInstance)(
Target &target);
typedef lldb::ScriptInterpreterSP (*ScriptInterpreterCreateInstance)(
Debugger &debugger);
+typedef llvm::Expected<lldb::SyntheticFrameProviderSP> (
+ *ScriptedFrameProviderCreateInstance)(
+ lldb::StackFrameListSP input_frames,
+ const lldb_private::SyntheticFrameProviderDescriptor &descriptor);
+typedef llvm::Expected<lldb::SyntheticFrameProviderSP> (
+ *SyntheticFrameProviderCreateInstance)(
+ lldb::StackFrameListSP input_frames,
+ const std::vector<lldb_private::ThreadSpec> &thread_specs);
typedef SymbolFile *(*SymbolFileCreateInstance)(lldb::ObjectFileSP objfile_sp);
typedef SymbolVendor *(*SymbolVendorCreateInstance)(
const lldb::ModuleSP &module_sp,
diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
index 63a3522..0122fe8 100644
--- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
+++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
@@ -294,6 +294,11 @@ ifeq "$(MAKE_DEBUG_NAMES)" "YES"
CFLAGS += -gpubnames
endif
+# Define _DEFAULT_SOURCE to expose POSIX/BSD extensions on glibc (e.g.
+# kill(), usleep(), getpgid(), ...)
+ifeq "$(OS)" "Linux"
+ CFLAGS += -D_DEFAULT_SOURCE
+endif
+
ifeq "$(USE_PRIVATE_MODULE_CACHE)" "YES"
THE_CLANG_MODULE_CACHE_DIR := $(BUILDDIR)/private-module-cache
else
diff --git a/lldb/source/API/SBModule.cpp b/lldb/source/API/SBModule.cpp
index 5a57f45..32067ac 100644
--- a/lldb/source/API/SBModule.cpp
+++ b/lldb/source/API/SBModule.cpp
@@ -37,8 +37,8 @@ SBModule::SBModule(const SBModuleSpec &module_spec) {
LLDB_INSTRUMENT_VA(this, module_spec);
ModuleSP module_sp;
- Status error = ModuleList::GetSharedModule(
- *module_spec.m_opaque_up, module_sp, nullptr, nullptr, nullptr);
+ Status error = ModuleList::GetSharedModule(*module_spec.m_opaque_up,
+ module_sp, nullptr, nullptr);
if (module_sp)
SetSP(module_sp);
}
diff --git a/lldb/source/API/SBModuleSpec.cpp b/lldb/source/API/SBModuleSpec.cpp
index fbbcfea..031ba12 100644
--- a/lldb/source/API/SBModuleSpec.cpp
+++ b/lldb/source/API/SBModuleSpec.cpp
@@ -9,6 +9,7 @@
#include "lldb/API/SBModuleSpec.h"
#include "Utils.h"
#include "lldb/API/SBStream.h"
+#include "lldb/API/SBTarget.h"
#include "lldb/Core/Module.h"
#include "lldb/Core/ModuleSpec.h"
#include "lldb/Host/Host.h"
@@ -174,6 +175,18 @@ void SBModuleSpec::SetObjectSize(uint64_t object_size) {
m_opaque_up->SetObjectSize(object_size);
}
+SBTarget SBModuleSpec::GetTarget() {
+ LLDB_INSTRUMENT_VA(this);
+
+ return SBTarget(m_opaque_up->GetTargetSP());
+}
+
+void SBModuleSpec::SetTarget(SBTarget target) {
+ LLDB_INSTRUMENT_VA(this, target);
+
+ m_opaque_up->SetTarget(target.GetSP());
+}
+
SBModuleSpecList::SBModuleSpecList() : m_opaque_up(new ModuleSpecList()) {
LLDB_INSTRUMENT_VA(this);
}
diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp
index 7580b15..b309e0f 100644
--- a/lldb/source/Core/DynamicLoader.cpp
+++ b/lldb/source/Core/DynamicLoader.cpp
@@ -227,6 +227,7 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress(
}
}
ModuleSpec module_spec;
+ module_spec.SetTarget(target.shared_from_this());
module_spec.GetUUID() = uuid;
FileSpec name_filespec(name);
if (FileSystem::Instance().Exists(name_filespec))
@@ -238,8 +239,8 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress(
// Has lldb already seen a module with this UUID?
// Or have external lookup enabled in DebugSymbols on macOS.
if (!module_sp)
- error = ModuleList::GetSharedModule(module_spec, module_sp, nullptr,
- nullptr, nullptr);
+ error =
+ ModuleList::GetSharedModule(module_spec, module_sp, nullptr, nullptr);
// Can lldb's symbol/executable location schemes
// find an executable and symbol file.
diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp
index c40612c..d9f8456 100644
--- a/lldb/source/Core/ModuleList.cpp
+++ b/lldb/source/Core/ModuleList.cpp
@@ -19,6 +19,8 @@
#include "lldb/Symbol/SymbolContext.h"
#include "lldb/Symbol/TypeList.h"
#include "lldb/Symbol/VariableList.h"
+#include "lldb/Target/Platform.h"
+#include "lldb/Target/Target.h"
#include "lldb/Utility/ArchSpec.h"
#include "lldb/Utility/ConstString.h"
#include "lldb/Utility/FileSpecList.h"
@@ -1038,9 +1040,9 @@ size_t ModuleList::RemoveOrphanSharedModules(bool mandatory) {
Status
ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules,
- bool *did_create_ptr, bool always_create) {
+ bool *did_create_ptr, bool always_create,
+ bool invoke_locate_callback) {
SharedModuleList &shared_module_list = GetSharedModuleList();
std::lock_guard<std::recursive_mutex> guard(shared_module_list.GetMutex());
char path[PATH_MAX];
@@ -1095,6 +1097,22 @@ ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp,
if (module_sp)
return error;
+  // Try the target platform's locate-module callback before falling back to
+  // creating the module from the spec below.
+ if (invoke_locate_callback) {
+ TargetSP target_sp = module_spec.GetTargetSP();
+ if (target_sp && target_sp->IsValid()) {
+ if (PlatformSP platform_sp = target_sp->GetPlatform()) {
+ FileSpec symbol_file_spec;
+ platform_sp->CallLocateModuleCallbackIfSet(
+ module_spec, module_sp, symbol_file_spec, did_create_ptr);
+ if (module_sp) {
+ // The callback found a module.
+ return error;
+ }
+ }
+ }
+ }
+
module_sp = std::make_shared<Module>(module_spec);
// Make sure there are a module and an object file since we can specify a
// valid file path with an architecture that might not be in that file. By
@@ -1122,10 +1140,16 @@ ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp,
module_sp.reset();
}
- if (module_search_paths_ptr) {
- const auto num_directories = module_search_paths_ptr->GetSize();
+ // Get module search paths from the target if available.
+ lldb::TargetSP target_sp = module_spec.GetTargetSP();
+ FileSpecList module_search_paths;
+ if (target_sp)
+ module_search_paths = target_sp->GetExecutableSearchPaths();
+
+ if (!module_search_paths.IsEmpty()) {
+ const auto num_directories = module_search_paths.GetSize();
for (size_t idx = 0; idx < num_directories; ++idx) {
- auto search_path_spec = module_search_paths_ptr->GetFileSpecAtIndex(idx);
+ auto search_path_spec = module_search_paths.GetFileSpecAtIndex(idx);
FileSystem::Instance().Resolve(search_path_spec);
namespace fs = llvm::sys::fs;
if (!FileSystem::Instance().IsDirectory(search_path_spec))
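The net effect on callers, as a short sketch mirroring the DynamicLoader and core-file changes elsewhere in this patch: attach the target to the ModuleSpec up front and drop the explicit search-path argument. The sketch assumes a Target &target and a UUID uuid in scope.

    // Before: GetSharedModule(module_spec, module_sp, &search_paths, ...).
    // After: the target supplies both the executable search paths and the
    // locate-module callback.
    ModuleSpec module_spec;
    module_spec.SetTarget(target.shared_from_this());
    module_spec.GetUUID() = uuid;

    lldb::ModuleSP module_sp;
    Status error = ModuleList::GetSharedModule(module_spec, module_sp,
                                               /*old_modules=*/nullptr,
                                               /*did_create_ptr=*/nullptr);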
diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp
index 5887367..4e3563c 100644
--- a/lldb/source/Core/PluginManager.cpp
+++ b/lldb/source/Core/PluginManager.cpp
@@ -1300,6 +1300,61 @@ PluginManager::GetScriptInterpreterForLanguage(lldb::ScriptLanguage script_lang,
return none_instance(debugger);
}
+#pragma mark SyntheticFrameProvider
+
+typedef PluginInstance<SyntheticFrameProviderCreateInstance>
+ SyntheticFrameProviderInstance;
+typedef PluginInstance<ScriptedFrameProviderCreateInstance>
+ ScriptedFrameProviderInstance;
+typedef PluginInstances<SyntheticFrameProviderInstance>
+ SyntheticFrameProviderInstances;
+typedef PluginInstances<ScriptedFrameProviderInstance>
+ ScriptedFrameProviderInstances;
+
+static SyntheticFrameProviderInstances &GetSyntheticFrameProviderInstances() {
+ static SyntheticFrameProviderInstances g_instances;
+ return g_instances;
+}
+
+static ScriptedFrameProviderInstances &GetScriptedFrameProviderInstances() {
+ static ScriptedFrameProviderInstances g_instances;
+ return g_instances;
+}
+
+bool PluginManager::RegisterPlugin(
+ llvm::StringRef name, llvm::StringRef description,
+ SyntheticFrameProviderCreateInstance create_native_callback,
+ ScriptedFrameProviderCreateInstance create_scripted_callback) {
+ if (create_native_callback)
+ return GetSyntheticFrameProviderInstances().RegisterPlugin(
+ name, description, create_native_callback);
+ else if (create_scripted_callback)
+ return GetScriptedFrameProviderInstances().RegisterPlugin(
+ name, description, create_scripted_callback);
+ return false;
+}
+
+bool PluginManager::UnregisterPlugin(
+ SyntheticFrameProviderCreateInstance create_callback) {
+ return GetSyntheticFrameProviderInstances().UnregisterPlugin(create_callback);
+}
+
+bool PluginManager::UnregisterPlugin(
+ ScriptedFrameProviderCreateInstance create_callback) {
+ return GetScriptedFrameProviderInstances().UnregisterPlugin(create_callback);
+}
+
+SyntheticFrameProviderCreateInstance
+PluginManager::GetSyntheticFrameProviderCreateCallbackForPluginName(
+ llvm::StringRef name) {
+ return GetSyntheticFrameProviderInstances().GetCallbackForName(name);
+}
+
+ScriptedFrameProviderCreateInstance
+PluginManager::GetScriptedFrameProviderCreateCallbackAtIndex(uint32_t idx) {
+ return GetScriptedFrameProviderInstances().GetCallbackAtIndex(idx);
+}
+
#pragma mark StructuredDataPlugin
struct StructuredDataPluginInstance
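For a native provider, registration through the new overload might look like the following sketch. MyFrameProvider and its factory are hypothetical; only the PluginManager::RegisterPlugin signature comes from this patch.

    // Hypothetical native plugin registration (illustration only).
    static llvm::Expected<lldb::SyntheticFrameProviderSP>
    CreateMyFrameProvider(lldb::StackFrameListSP input_frames,
                          const std::vector<ThreadSpec> &thread_specs) {
      // MyFrameProvider is a made-up SyntheticFrameProvider subclass;
      // thread_specs handling is omitted here.
      return std::make_shared<MyFrameProvider>(input_frames);
    }

    void MyFrameProvider::Initialize() {
      PluginManager::RegisterPlugin(
          "my-frame-provider", "Hypothetical synthetic frame provider",
          /*create_native_callback=*/CreateMyFrameProvider,
          /*create_scripted_callback=*/nullptr);
    }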
diff --git a/lldb/source/Core/Section.cpp b/lldb/source/Core/Section.cpp
index 02d9d86..d3f753e 100644
--- a/lldb/source/Core/Section.cpp
+++ b/lldb/source/Core/Section.cpp
@@ -471,6 +471,10 @@ bool Section::ContainsOnlyDebugInfo() const {
return false;
}
+bool Section::IsGOTSection() const {
+ return GetObjectFile()->IsGOTSection(*this);
+}
+
#pragma mark SectionList
SectionList &SectionList::operator=(const SectionList &rhs) {
diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp
index ca768db..211868b 100644
--- a/lldb/source/Interpreter/ScriptInterpreter.cpp
+++ b/lldb/source/Interpreter/ScriptInterpreter.cpp
@@ -150,6 +150,11 @@ ScriptInterpreter::GetOpaqueTypeFromSBExecutionContext(
return exe_ctx.m_exe_ctx_sp;
}
+lldb::StackFrameListSP ScriptInterpreter::GetOpaqueTypeFromSBFrameList(
+ const lldb::SBFrameList &frame_list) const {
+ return frame_list.m_opaque_sp;
+}
+
lldb::ScriptLanguage
ScriptInterpreter::StringToLanguage(const llvm::StringRef &language) {
if (language.equals_insensitive(LanguageToString(eScriptLanguageNone)))
diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp
index 1d210ea..2d0a4f67 100644
--- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp
+++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp
@@ -789,6 +789,7 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule(
// Search for the kext on the local filesystem via the UUID
if (!m_module_sp && m_uuid.IsValid()) {
ModuleSpec module_spec;
+ module_spec.SetTarget(target.shared_from_this());
module_spec.GetUUID() = m_uuid;
if (!m_uuid.IsValid())
module_spec.GetArchitecture() = target.GetArchitecture();
@@ -801,9 +802,8 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule(
// system.
PlatformSP platform_sp(target.GetPlatform());
if (platform_sp) {
- FileSpecList search_paths = target.GetExecutableSearchPaths();
- platform_sp->GetSharedModule(module_spec, process, m_module_sp,
- &search_paths, nullptr, nullptr);
+ platform_sp->GetSharedModule(module_spec, process, m_module_sp, nullptr,
+ nullptr);
}
// Ask the Target to find this file on the local system, if possible.
diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp
index 326b691..470fc2a 100644
--- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp
+++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp
@@ -901,10 +901,9 @@ void DynamicLoaderPOSIXDYLD::ResolveExecutableModule(
if (module_sp && module_sp->MatchesModuleSpec(module_spec))
return;
+ module_spec.SetTarget(target.shared_from_this());
const auto executable_search_paths(Target::GetDefaultExecutableSearchPaths());
- auto error = platform_sp->ResolveExecutable(
- module_spec, module_sp,
- !executable_search_paths.IsEmpty() ? &executable_search_paths : nullptr);
+ auto error = platform_sp->ResolveExecutable(module_spec, module_sp);
if (error.Fail()) {
StreamString stream;
module_spec.Dump(stream);
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.h
index ad4d060..debf476 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.h
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.h
@@ -41,11 +41,11 @@ public:
/// The path to the exact module to be loaded. E.g., if the desired
/// module is std.io, then this should be { "std", "io" }.
///
- /// \param[in] exported_modules
+ /// \param[out] exported_modules
/// If non-NULL, a pointer to a vector to populate with the ID of every
/// module that is re-exported by the specified module.
///
- /// \param[in] error_stream
+ /// \param[out] error_stream
/// A stream to populate with the output of the Clang parser when
/// it tries to load the module.
///
@@ -63,11 +63,11 @@ public:
/// \param[in] cu
/// The compilation unit to scan for imported modules.
///
- /// \param[in] exported_modules
+ /// \param[out] exported_modules
/// A vector to populate with the ID of each module loaded (directly
/// and via re-exports) in this way.
///
- /// \param[in] error_stream
+ /// \param[out] error_stream
/// A stream to populate with the output of the Clang parser when
/// it tries to load the modules.
///
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index c8e520d..2218c23 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -5936,6 +5936,20 @@ Section *ObjectFileMachO::GetMachHeaderSection() {
return nullptr;
}
+bool ObjectFileMachO::IsGOTSection(const lldb_private::Section &section) const {
+ assert(section.GetObjectFile() == this && "Wrong object file!");
+ SectionSP segment = section.GetParent();
+ if (!segment)
+ return false;
+
+ const bool is_data_const_got =
+ segment->GetName() == "__DATA_CONST" && section.GetName() == "__got";
+ const bool is_auth_const_ptr =
+ segment->GetName() == "__AUTH_CONST" &&
+ (section.GetName() == "__auth_got" || section.GetName() == "__auth_ptr");
+ return is_data_const_got || is_auth_const_ptr;
+}
+
bool ObjectFileMachO::SectionIsLoadable(const Section *section) {
if (!section)
return false;
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
index 25643aa..5456f03 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
@@ -162,6 +162,8 @@ public:
lldb_private::Section *GetMachHeaderSection();
+ bool IsGOTSection(const lldb_private::Section &section) const override;
+
// PluginInterface protocol
llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp
index 4cfb0a8..47111c9 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp
@@ -90,7 +90,7 @@ void PlatformAppleSimulator::GetStatus(Stream &strm) {
if (!sdk.empty())
strm << " SDK Path: \"" << sdk << "\"\n";
else
- strm << " SDK Path: error: unable to locate SDK\n";
+ strm << " SDK Path: <unable to locate SDK>\n";
#if defined(__APPLE__)
// This will get called by subclasses, so just output status on the current
@@ -420,7 +420,6 @@ Status PlatformAppleSimulator::GetSymbolFile(const FileSpec &platform_file,
Status PlatformAppleSimulator::GetSharedModule(
const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) {
// For iOS/tvOS/watchOS, the SDK files are all cached locally on the
// host system. So first we ask for the file in the cached SDK, then
@@ -432,12 +431,10 @@ Status PlatformAppleSimulator::GetSharedModule(
error = GetSymbolFile(platform_file, module_spec.GetUUIDPtr(),
platform_module_spec.GetFileSpec());
if (error.Success()) {
- error = ResolveExecutable(platform_module_spec, module_sp,
- module_search_paths_ptr);
+ error = ResolveExecutable(platform_module_spec, module_sp);
} else {
const bool always_create = false;
- error = ModuleList::GetSharedModule(module_spec, module_sp,
- module_search_paths_ptr, old_modules,
+ error = ModuleList::GetSharedModule(module_spec, module_sp, old_modules,
did_create_ptr, always_create);
}
if (module_sp)
@@ -660,4 +657,3 @@ void PlatformAppleSimulator::Terminate() {
PlatformDarwin::Terminate();
}
}
-
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h
index 7fcf2c5..77d2a3b 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h
@@ -89,7 +89,6 @@ public:
Status GetSharedModule(const ModuleSpec &module_spec, Process *process,
lldb::ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules,
bool *did_create_ptr) override;
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
index 5aad447..8b4a3e0 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
@@ -331,7 +331,6 @@ Status PlatformDarwin::ResolveSymbolFile(Target &target,
Status PlatformDarwin::GetSharedModule(
const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) {
Status error;
module_sp.reset();
@@ -341,19 +340,22 @@ Status PlatformDarwin::GetSharedModule(
// module first.
if (m_remote_platform_sp) {
error = m_remote_platform_sp->GetSharedModule(
- module_spec, process, module_sp, module_search_paths_ptr, old_modules,
- did_create_ptr);
+ module_spec, process, module_sp, old_modules, did_create_ptr);
}
}
if (!module_sp) {
// Fall back to the local platform and find the file locally
error = Platform::GetSharedModule(module_spec, process, module_sp,
- module_search_paths_ptr, old_modules,
- did_create_ptr);
+ old_modules, did_create_ptr);
const FileSpec &platform_file = module_spec.GetFileSpec();
- if (!module_sp && module_search_paths_ptr && platform_file) {
+ // Get module search paths from the target if available.
+ TargetSP target_sp = module_spec.GetTargetSP();
+ FileSpecList module_search_paths;
+ if (target_sp)
+ module_search_paths = target_sp->GetExecutableSearchPaths();
+ if (!module_sp && !module_search_paths.IsEmpty() && platform_file) {
// We can try to pull off part of the file path up to the bundle
// directory level and try any module search paths...
FileSpec bundle_directory;
@@ -362,9 +364,9 @@ Status PlatformDarwin::GetSharedModule(
ModuleSpec new_module_spec(module_spec);
new_module_spec.GetFileSpec() = bundle_directory;
if (Host::ResolveExecutableInBundle(new_module_spec.GetFileSpec())) {
- Status new_error(Platform::GetSharedModule(
- new_module_spec, process, module_sp, nullptr, old_modules,
- did_create_ptr));
+ Status new_error(Platform::GetSharedModule(new_module_spec, process,
+ module_sp, old_modules,
+ did_create_ptr));
if (module_sp)
return new_error;
@@ -376,10 +378,10 @@ Status PlatformDarwin::GetSharedModule(
const size_t bundle_directory_len =
bundle_directory.GetPath(bundle_dir, sizeof(bundle_dir));
char new_path[PATH_MAX];
- size_t num_module_search_paths = module_search_paths_ptr->GetSize();
+ size_t num_module_search_paths = module_search_paths.GetSize();
for (size_t i = 0; i < num_module_search_paths; ++i) {
const size_t search_path_len =
- module_search_paths_ptr->GetFileSpecAtIndex(i).GetPath(
+ module_search_paths.GetFileSpecAtIndex(i).GetPath(
new_path, sizeof(new_path));
if (search_path_len < sizeof(new_path)) {
snprintf(new_path + search_path_len,
@@ -390,7 +392,7 @@ Status PlatformDarwin::GetSharedModule(
ModuleSpec new_module_spec(module_spec);
new_module_spec.GetFileSpec() = new_file_spec;
Status new_error(Platform::GetSharedModule(
- new_module_spec, process, module_sp, nullptr, old_modules,
+ new_module_spec, process, module_sp, old_modules,
did_create_ptr));
if (module_sp) {
@@ -1303,12 +1305,15 @@ PlatformDarwin::LaunchProcess(lldb_private::ProcessLaunchInfo &launch_info) {
lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths(
const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) {
const FileSpec &platform_file = module_spec.GetFileSpec();
- // See if the file is present in any of the module_search_paths_ptr
+ TargetSP target_sp = module_spec.GetTargetSP();
+ FileSpecList module_search_paths;
+ if (target_sp)
+ module_search_paths = target_sp->GetExecutableSearchPaths();
+ // See if the file is present in any of the module_search_paths
// directories.
- if (!module_sp && module_search_paths_ptr && platform_file) {
+ if (!module_sp && !module_search_paths.IsEmpty() && platform_file) {
// create a vector of all the file / directory names in platform_file e.g.
// this might be
// /System/Library/PrivateFrameworks/UIFoundation.framework/UIFoundation
@@ -1322,21 +1327,21 @@ lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths(
std::reverse(path_parts.begin(), path_parts.end());
const size_t path_parts_size = path_parts.size();
- size_t num_module_search_paths = module_search_paths_ptr->GetSize();
+ size_t num_module_search_paths = module_search_paths.GetSize();
for (size_t i = 0; i < num_module_search_paths; ++i) {
Log *log_verbose = GetLog(LLDBLog::Host);
LLDB_LOGF(
log_verbose,
"PlatformRemoteDarwinDevice::GetSharedModule searching for binary in "
"search-path %s",
- module_search_paths_ptr->GetFileSpecAtIndex(i).GetPath().c_str());
+ module_search_paths.GetFileSpecAtIndex(i).GetPath().c_str());
-      // Create a new FileSpec with this module_search_paths_ptr plus just the
+      // Create a new FileSpec with this module_search_paths plus just the
// filename ("UIFoundation"), then the parent dir plus filename
// ("UIFoundation.framework/UIFoundation") etc - up to four names (to
// handle "Foo.framework/Contents/MacOS/Foo")
for (size_t j = 0; j < 4 && j < path_parts_size - 1; ++j) {
- FileSpec path_to_try(module_search_paths_ptr->GetFileSpecAtIndex(i));
+ FileSpec path_to_try(module_search_paths.GetFileSpecAtIndex(i));
// Add the components backwards. For
// .../PrivateFrameworks/UIFoundation.framework/UIFoundation path_parts
@@ -1356,9 +1361,9 @@ lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths(
if (FileSystem::Instance().Exists(path_to_try)) {
ModuleSpec new_module_spec(module_spec);
new_module_spec.GetFileSpec() = path_to_try;
- Status new_error(
- Platform::GetSharedModule(new_module_spec, process, module_sp,
- nullptr, old_modules, did_create_ptr));
+ Status new_error(Platform::GetSharedModule(new_module_spec, process,
+ module_sp, old_modules,
+ did_create_ptr));
if (module_sp) {
module_sp->SetPlatformFileSpec(path_to_try);
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h
index f8a62ce..82e69e3 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h
@@ -73,7 +73,6 @@ public:
Status GetSharedModule(const ModuleSpec &module_spec, Process *process,
lldb::ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules,
bool *did_create_ptr) override;
@@ -189,7 +188,7 @@ protected:
Status FindBundleBinaryInExecSearchPaths(
const ModuleSpec &module_spec, Process *process,
- lldb::ModuleSP &module_sp, const FileSpecList *module_search_paths_ptr,
+ lldb::ModuleSP &module_sp,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr);
// The OSType where lldb is running.
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp
index 68ef817..a72d94e 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp
@@ -295,7 +295,6 @@ BringInRemoteFile(Platform *platform,
lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache(
const lldb_private::ModuleSpec &module_spec, lldb::ModuleSP &module_sp,
- const lldb_private::FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) {
Log *log = GetLog(LLDBLog::Platform);
@@ -329,8 +328,7 @@ lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache(
ModuleSpec shared_cache_spec(module_spec.GetFileSpec(), image_info.uuid,
image_info.data_sp);
err = ModuleList::GetSharedModule(shared_cache_spec, module_sp,
- module_search_paths_ptr, old_modules,
- did_create_ptr);
+ old_modules, did_create_ptr);
if (module_sp) {
LLDB_LOGF(log, "[%s] module %s was found in the in-memory shared cache",
(IsHost() ? "host" : "remote"),
@@ -348,8 +346,7 @@ lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache(
FileSystem::Instance().Resolve(device_support_spec);
if (FileSystem::Instance().Exists(device_support_spec)) {
ModuleSpec local_spec(device_support_spec, module_spec.GetUUID());
- err = ModuleList::GetSharedModule(local_spec, module_sp,
- module_search_paths_ptr, old_modules,
+ err = ModuleList::GetSharedModule(local_spec, module_sp, old_modules,
did_create_ptr);
if (module_sp) {
LLDB_LOGF(log,
@@ -363,8 +360,7 @@ lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache(
}
}
- err = ModuleList::GetSharedModule(module_spec, module_sp,
- module_search_paths_ptr, old_modules,
+ err = ModuleList::GetSharedModule(module_spec, module_sp, old_modules,
did_create_ptr);
if (module_sp)
return err;
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.h
index e1eba08f..e0142ab 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.h
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.h
@@ -26,7 +26,6 @@ public:
protected:
virtual Status GetSharedModuleWithLocalCache(
const ModuleSpec &module_spec, lldb::ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr);
struct SDKDirectoryInfo {
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp
index 07c5a52..04e87b9d 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp
@@ -719,7 +719,6 @@ void PlatformDarwinKernel::UpdateKextandKernelsLocalScan() {
Status PlatformDarwinKernel::GetSharedModule(
const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) {
Status error;
module_sp.reset();
@@ -734,14 +733,12 @@ Status PlatformDarwinKernel::GetSharedModule(
// UUID search can get here with no name - and it may be a kernel.
if (kext_bundle_id == "mach_kernel" || kext_bundle_id.empty()) {
error = GetSharedModuleKernel(module_spec, process, module_sp,
- module_search_paths_ptr, old_modules,
- did_create_ptr);
+ old_modules, did_create_ptr);
if (error.Success() && module_sp) {
return error;
}
} else {
- return GetSharedModuleKext(module_spec, process, module_sp,
- module_search_paths_ptr, old_modules,
+ return GetSharedModuleKext(module_spec, process, module_sp, old_modules,
did_create_ptr);
}
}
@@ -749,13 +746,11 @@ Status PlatformDarwinKernel::GetSharedModule(
// Give the generic methods, including possibly calling into DebugSymbols
// framework on macOS systems, a chance.
return PlatformDarwin::GetSharedModule(module_spec, process, module_sp,
- module_search_paths_ptr, old_modules,
- did_create_ptr);
+ old_modules, did_create_ptr);
}
Status PlatformDarwinKernel::GetSharedModuleKext(
const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) {
Status error;
module_sp.reset();
@@ -782,8 +777,7 @@ Status PlatformDarwinKernel::GetSharedModuleKext(
// Give the generic methods, including possibly calling into DebugSymbols
// framework on macOS systems, a chance.
error = PlatformDarwin::GetSharedModule(module_spec, process, module_sp,
- module_search_paths_ptr, old_modules,
- did_create_ptr);
+ old_modules, did_create_ptr);
if (error.Success() && module_sp.get()) {
return error;
}
@@ -793,7 +787,6 @@ Status PlatformDarwinKernel::GetSharedModuleKext(
Status PlatformDarwinKernel::GetSharedModuleKernel(
const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) {
assert(module_sp.get() == nullptr);
UpdateKextandKernelsLocalScan();
@@ -848,8 +841,7 @@ Status PlatformDarwinKernel::GetSharedModuleKernel(
// Give the generic methods, including possibly calling into DebugSymbols
// framework on macOS systems, a chance.
return PlatformDarwin::GetSharedModule(module_spec, process, module_sp,
- module_search_paths_ptr, old_modules,
- did_create_ptr);
+ old_modules, did_create_ptr);
}
std::vector<lldb_private::FileSpec>
@@ -888,8 +880,8 @@ Status PlatformDarwinKernel::ExamineKextForMatchingUUID(
ModuleSP module_sp(new Module(exe_spec));
if (module_sp && module_sp->GetObjectFile() &&
module_sp->MatchesModuleSpec(exe_spec)) {
- Status error = ModuleList::GetSharedModule(exe_spec, exe_module_sp,
- NULL, NULL, NULL);
+ Status error =
+ ModuleList::GetSharedModule(exe_spec, exe_module_sp, NULL, NULL);
if (exe_module_sp && exe_module_sp->GetObjectFile()) {
return error;
}
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h
index 9db9c00..b5cf701 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h
@@ -60,7 +60,6 @@ public:
Status GetSharedModule(const ModuleSpec &module_spec, Process *process,
lldb::ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules,
bool *did_create_ptr) override;
@@ -142,14 +141,14 @@ protected:
Status GetSharedModuleKext(const ModuleSpec &module_spec, Process *process,
lldb::ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules,
bool *did_create_ptr);
- Status GetSharedModuleKernel(
- const ModuleSpec &module_spec, Process *process,
- lldb::ModuleSP &module_sp, const FileSpecList *module_search_paths_ptr,
- llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr);
+ Status
+ GetSharedModuleKernel(const ModuleSpec &module_spec, Process *process,
+ lldb::ModuleSP &module_sp,
+ llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules,
+ bool *did_create_ptr);
Status ExamineKextForMatchingUUID(const FileSpec &kext_bundle_path,
const UUID &uuid, const ArchSpec &arch,
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp
index dad6dcd..e6ea75a 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp
@@ -182,10 +182,8 @@ PlatformMacOSX::GetSupportedArchitectures(const ArchSpec &process_host_arch) {
lldb_private::Status PlatformMacOSX::GetSharedModule(
const lldb_private::ModuleSpec &module_spec, Process *process,
lldb::ModuleSP &module_sp,
- const lldb_private::FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) {
Status error = GetSharedModuleWithLocalCache(module_spec, module_sp,
- module_search_paths_ptr,
old_modules, did_create_ptr);
if (module_sp) {
@@ -199,9 +197,9 @@ lldb_private::Status PlatformMacOSX::GetSharedModule(
lldb::ModuleSP x86_64_module_sp;
llvm::SmallVector<lldb::ModuleSP, 1> old_x86_64_modules;
bool did_create = false;
- Status x86_64_error = GetSharedModuleWithLocalCache(
- module_spec_x86_64, x86_64_module_sp, module_search_paths_ptr,
- &old_x86_64_modules, &did_create);
+ Status x86_64_error =
+ GetSharedModuleWithLocalCache(module_spec_x86_64, x86_64_module_sp,
+ &old_x86_64_modules, &did_create);
if (x86_64_module_sp && x86_64_module_sp->GetObjectFile()) {
module_sp = x86_64_module_sp;
if (old_modules)
@@ -217,7 +215,6 @@ lldb_private::Status PlatformMacOSX::GetSharedModule(
if (!module_sp) {
error = FindBundleBinaryInExecSearchPaths(module_spec, process, module_sp,
- module_search_paths_ptr,
old_modules, did_create_ptr);
}
return error;
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h
index be84485..9555b16 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h
@@ -48,7 +48,6 @@ public:
Status GetSharedModule(const ModuleSpec &module_spec, Process *process,
lldb::ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules,
bool *did_create_ptr) override;
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp
index b83d07b..53fab93 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp
@@ -53,7 +53,7 @@ void PlatformRemoteDarwinDevice::GetStatus(Stream &strm) {
if (sdk_directory)
strm.Printf(" SDK Path: \"%s\"\n", sdk_directory);
else
- strm.PutCString(" SDK Path: error: unable to locate SDK\n");
+ strm.PutCString(" SDK Path: <unable to locate SDK>\n");
const uint32_t num_sdk_infos = m_sdk_directory_infos.size();
for (uint32_t i = 0; i < num_sdk_infos; ++i) {
@@ -158,7 +158,6 @@ Status PlatformRemoteDarwinDevice::GetSymbolFile(const FileSpec &platform_file,
Status PlatformRemoteDarwinDevice::GetSharedModule(
const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) {
// For iOS, the SDK files are all cached locally on the host system. So first
// we ask for the file in the cached SDK, then we attempt to get a shared
@@ -185,7 +184,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule(
if (GetFileInSDK(platform_file_path, connected_sdk_idx,
platform_module_spec.GetFileSpec())) {
module_sp.reset();
- error = ResolveExecutable(platform_module_spec, module_sp, nullptr);
+ error = ResolveExecutable(platform_module_spec, module_sp);
if (module_sp) {
m_last_module_sdk_idx = connected_sdk_idx;
error.Clear();
@@ -202,7 +201,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule(
if (GetFileInSDK(platform_file_path, m_last_module_sdk_idx,
platform_module_spec.GetFileSpec())) {
module_sp.reset();
- error = ResolveExecutable(platform_module_spec, module_sp, nullptr);
+ error = ResolveExecutable(platform_module_spec, module_sp);
if (module_sp) {
error.Clear();
return error;
@@ -224,7 +223,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule(
if (GetFileInSDK(platform_file_path, current_sdk_idx,
platform_module_spec.GetFileSpec())) {
module_sp.reset();
- error = ResolveExecutable(platform_module_spec, module_sp, nullptr);
+ error = ResolveExecutable(platform_module_spec, module_sp);
if (module_sp) {
m_last_module_sdk_idx = current_sdk_idx;
error.Clear();
@@ -245,7 +244,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule(
platform_module_spec.GetFileSpec())) {
// printf ("sdk[%u]: '%s'\n", sdk_idx, local_file.GetPath().c_str());
- error = ResolveExecutable(platform_module_spec, module_sp, nullptr);
+ error = ResolveExecutable(platform_module_spec, module_sp);
if (module_sp) {
// Remember the index of the last SDK that we found a file in in case
// the wrong SDK was selected.
@@ -261,8 +260,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule(
// This may not be an SDK-related module. Try whether we can bring in the
// thing to our local cache.
- error = GetSharedModuleWithLocalCache(module_spec, module_sp,
- module_search_paths_ptr, old_modules,
+ error = GetSharedModuleWithLocalCache(module_spec, module_sp, old_modules,
did_create_ptr);
if (error.Success())
return error;
@@ -271,15 +269,13 @@ Status PlatformRemoteDarwinDevice::GetSharedModule(
// directories.
if (!module_sp)
error = PlatformDarwin::FindBundleBinaryInExecSearchPaths(
- module_spec, process, module_sp, module_search_paths_ptr, old_modules,
- did_create_ptr);
+ module_spec, process, module_sp, old_modules, did_create_ptr);
if (error.Success())
return error;
const bool always_create = false;
- error = ModuleList::GetSharedModule(module_spec, module_sp,
- module_search_paths_ptr, old_modules,
+ error = ModuleList::GetSharedModule(module_spec, module_sp, old_modules,
did_create_ptr, always_create);
if (module_sp)
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h
index 557f487..4abd74e 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h
@@ -47,7 +47,6 @@ public:
Status GetSharedModule(const ModuleSpec &module_spec, Process *process,
lldb::ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules,
bool *did_create_ptr) override;
diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
index b7029fb..f8e33ea 100644
--- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
+++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
@@ -84,8 +84,9 @@ bool ProcessElfCore::CanDebug(lldb::TargetSP target_sp,
// For now we are just making sure the file exists for a given module
if (!m_core_module_sp && FileSystem::Instance().Exists(m_core_file)) {
ModuleSpec core_module_spec(m_core_file, target_sp->GetArchitecture());
+ core_module_spec.SetTarget(target_sp);
Status error(ModuleList::GetSharedModule(core_module_spec, m_core_module_sp,
- nullptr, nullptr, nullptr));
+ nullptr, nullptr));
if (m_core_module_sp) {
ObjectFile *core_objfile = m_core_module_sp->GetObjectFile();
if (core_objfile && core_objfile->GetType() == ObjectFile::eTypeCoreFile)
diff --git a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp
index a780b3f..83d684e 100644
--- a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp
+++ b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp
@@ -95,8 +95,9 @@ bool ProcessMachCore::CanDebug(lldb::TargetSP target_sp,
// header but we should still try to use it -
// ModuleSpecList::FindMatchingModuleSpec enforces a strict arch match.
ModuleSpec core_module_spec(m_core_file);
+ core_module_spec.SetTarget(target_sp);
Status error(ModuleList::GetSharedModule(core_module_spec, m_core_module_sp,
- nullptr, nullptr, nullptr));
+ nullptr, nullptr));
if (m_core_module_sp) {
ObjectFile *core_objfile = m_core_module_sp->GetObjectFile();
diff --git a/lldb/source/Plugins/Process/scripted/ScriptedFrame.h b/lldb/source/Plugins/Process/scripted/ScriptedFrame.h
index 6e01e2f..b6b77c4 100644
--- a/lldb/source/Plugins/Process/scripted/ScriptedFrame.h
+++ b/lldb/source/Plugins/Process/scripted/ScriptedFrame.h
@@ -9,7 +9,6 @@
#ifndef LLDB_SOURCE_PLUGINS_SCRIPTED_FRAME_H
#define LLDB_SOURCE_PLUGINS_SCRIPTED_FRAME_H
-#include "Plugins/Process/Utility/RegisterContextMemory.h"
#include "ScriptedThread.h"
#include "lldb/Interpreter/ScriptInterpreter.h"
#include "lldb/Target/DynamicRegisterInfo.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
index 0910357..50569cd 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
@@ -23,6 +23,7 @@ add_lldb_library(lldbPluginScriptInterpreterPythonInterfaces PLUGIN
OperatingSystemPythonInterface.cpp
ScriptInterpreterPythonInterfaces.cpp
ScriptedFramePythonInterface.cpp
+ ScriptedFrameProviderPythonInterface.cpp
ScriptedPlatformPythonInterface.cpp
ScriptedProcessPythonInterface.cpp
ScriptedPythonInterface.cpp
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.h
index 3814f46..b2a3479 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.h
@@ -17,6 +17,7 @@
#include "OperatingSystemPythonInterface.h"
#include "ScriptedBreakpointPythonInterface.h"
+#include "ScriptedFrameProviderPythonInterface.h"
#include "ScriptedFramePythonInterface.h"
#include "ScriptedPlatformPythonInterface.h"
#include "ScriptedProcessPythonInterface.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp
new file mode 100644
index 0000000..b866bf3
--- /dev/null
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp
@@ -0,0 +1,57 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Host/Config.h"
+#include "lldb/Target/Thread.h"
+#include "lldb/Utility/Log.h"
+#include "lldb/lldb-enumerations.h"
+
+#if LLDB_ENABLE_PYTHON
+
+// LLDB Python header must be included first
+#include "../lldb-python.h"
+
+#include "../SWIGPythonBridge.h"
+#include "../ScriptInterpreterPythonImpl.h"
+#include "ScriptedFrameProviderPythonInterface.h"
+#include <optional>
+
+using namespace lldb;
+using namespace lldb_private;
+using namespace lldb_private::python;
+using Locker = ScriptInterpreterPythonImpl::Locker;
+
+ScriptedFrameProviderPythonInterface::ScriptedFrameProviderPythonInterface(
+ ScriptInterpreterPythonImpl &interpreter)
+ : ScriptedFrameProviderInterface(), ScriptedPythonInterface(interpreter) {}
+
+llvm::Expected<StructuredData::GenericSP>
+ScriptedFrameProviderPythonInterface::CreatePluginObject(
+    llvm::StringRef class_name, lldb::StackFrameListSP input_frames,
+ StructuredData::DictionarySP args_sp) {
+ if (!input_frames)
+ return llvm::createStringError("Invalid frame list");
+
+ StructuredDataImpl sd_impl(args_sp);
+ return ScriptedPythonInterface::CreatePluginObject(class_name, nullptr,
+ input_frames, sd_impl);
+}
+
+StructuredData::ObjectSP
+ScriptedFrameProviderPythonInterface::GetFrameAtIndex(uint32_t index) {
+ Status error;
+ StructuredData::ObjectSP obj = Dispatch("get_frame_at_index", error, index);
+
+ if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj,
+ error))
+ return {};
+
+ return obj;
+}
+
+#endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h
new file mode 100644
index 0000000..fd16398
--- /dev/null
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h
@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDFRAMEPROVIDERPYTHONINTERFACE_H
+#define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDFRAMEPROVIDERPYTHONINTERFACE_H
+
+#include "lldb/Host/Config.h"
+
+#if LLDB_ENABLE_PYTHON
+
+#include "ScriptedPythonInterface.h"
+#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h"
+#include <optional>
+
+namespace lldb_private {
+class ScriptedFrameProviderPythonInterface
+ : public ScriptedFrameProviderInterface,
+ public ScriptedPythonInterface {
+public:
+ ScriptedFrameProviderPythonInterface(
+ ScriptInterpreterPythonImpl &interpreter);
+
+ llvm::Expected<StructuredData::GenericSP>
+ CreatePluginObject(llvm::StringRef class_name,
+ lldb::StackFrameListSP input_frames,
+ StructuredData::DictionarySP args_sp) override;
+
+ llvm::SmallVector<AbstractMethodRequirement>
+ GetAbstractMethodRequirements() const override {
+ return llvm::SmallVector<AbstractMethodRequirement>(
+ {{"get_frame_at_index"}});
+ }
+
+ StructuredData::ObjectSP GetFrameAtIndex(uint32_t index) override;
+};
+} // namespace lldb_private
+
+#endif // LLDB_ENABLE_PYTHON
+#endif // LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDFRAMEPROVIDERPYTHONINTERFACE_H
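For context, a minimal sketch (not part of this patch) of how a caller might drive the new interface; script_interp, input_frames, and ConsumeSyntheticFrame are assumed names:

    // Instantiate the Python provider class named by the user, then pull
    // synthesized frames until the provider returns nothing.
    auto iface_sp = script_interp.CreateScriptedFrameProviderInterface();
    llvm::Expected<StructuredData::GenericSP> obj_or_err =
        iface_sp->CreatePluginObject("my_module.MyFrameProvider", input_frames,
                                     /*args_sp=*/nullptr);
    if (!obj_or_err)
      return obj_or_err.takeError(); // sketch assumes an Expected-returning caller
    for (uint32_t idx = 0;; ++idx) {
      StructuredData::ObjectSP frame_sp = iface_sp->GetFrameAtIndex(idx);
      if (!frame_sp)
        break;
      ConsumeSyntheticFrame(frame_sp); // hypothetical consumer
    }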
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
index 4fdf2b1..af2e0b5 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
@@ -243,4 +243,21 @@ ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::DescriptionLevel>(
return static_cast<lldb::DescriptionLevel>(unsigned_val);
}
+template <>
+lldb::StackFrameListSP
+ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::StackFrameListSP>(
+ python::PythonObject &p, Status &error) {
+
+ lldb::SBFrameList *sb_frame_list = reinterpret_cast<lldb::SBFrameList *>(
+ python::LLDBSWIGPython_CastPyObjectToSBFrameList(p.get()));
+
+ if (!sb_frame_list) {
+ error = Status::FromErrorStringWithFormat(
+ "couldn't cast lldb::SBFrameList to lldb::StackFrameListSP.");
+ return {};
+ }
+
+ return m_interpreter.GetOpaqueTypeFromSBFrameList(*sb_frame_list);
+}
+
#endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
index 2335b2e..ec1dd99 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
@@ -444,6 +444,14 @@ protected:
return python::SWIGBridge::ToSWIGWrapper(arg);
}
+ python::PythonObject Transform(lldb::ThreadSP arg) {
+ return python::SWIGBridge::ToSWIGWrapper(arg);
+ }
+
+ python::PythonObject Transform(lldb::StackFrameListSP arg) {
+ return python::SWIGBridge::ToSWIGWrapper(arg);
+ }
+
python::PythonObject Transform(lldb::ThreadPlanSP arg) {
return python::SWIGBridge::ToSWIGWrapper(arg);
}
@@ -628,6 +636,11 @@ lldb::DescriptionLevel
ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::DescriptionLevel>(
python::PythonObject &p, Status &error);
+template <>
+lldb::StackFrameListSP
+ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::StackFrameListSP>(
+ python::PythonObject &p, Status &error);
+
} // namespace lldb_private
#endif // LLDB_ENABLE_PYTHON
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
index 27f5d2e..2c97126 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
@@ -93,6 +93,7 @@ public:
static PythonObject ToSWIGWrapper(const StructuredDataImpl &data_impl);
static PythonObject ToSWIGWrapper(lldb::ThreadSP thread_sp);
static PythonObject ToSWIGWrapper(lldb::StackFrameSP frame_sp);
+ static PythonObject ToSWIGWrapper(lldb::StackFrameListSP frames_sp);
static PythonObject ToSWIGWrapper(lldb::DebuggerSP debugger_sp);
static PythonObject ToSWIGWrapper(lldb::WatchpointSP watchpoint_sp);
static PythonObject ToSWIGWrapper(lldb::BreakpointLocationSP bp_loc_sp);
@@ -269,6 +270,7 @@ void *LLDBSWIGPython_CastPyObjectToSBSymbolContext(PyObject *data);
void *LLDBSWIGPython_CastPyObjectToSBValue(PyObject *data);
void *LLDBSWIGPython_CastPyObjectToSBMemoryRegionInfo(PyObject *data);
void *LLDBSWIGPython_CastPyObjectToSBExecutionContext(PyObject *data);
+void *LLDBSWIGPython_CastPyObjectToSBFrameList(PyObject *data);
} // namespace python
} // namespace lldb_private
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
index d257a08..3493fa9 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
@@ -1526,6 +1526,11 @@ ScriptInterpreterPythonImpl::CreateScriptedFrameInterface() {
return std::make_shared<ScriptedFramePythonInterface>(*this);
}
+ScriptedFrameProviderInterfaceSP
+ScriptInterpreterPythonImpl::CreateScriptedFrameProviderInterface() {
+ return std::make_shared<ScriptedFrameProviderPythonInterface>(*this);
+}
+
ScriptedThreadPlanInterfaceSP
ScriptInterpreterPythonImpl::CreateScriptedThreadPlanInterface() {
return std::make_shared<ScriptedThreadPlanPythonInterface>(*this);
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
index 00ae59c..ad2ddd2 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
@@ -101,6 +101,9 @@ public:
lldb::ScriptedFrameInterfaceSP CreateScriptedFrameInterface() override;
+ lldb::ScriptedFrameProviderInterfaceSP
+ CreateScriptedFrameProviderInterface() override;
+
lldb::ScriptedThreadPlanInterfaceSP
CreateScriptedThreadPlanInterface() override;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
index 881268b..f00e94ae 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -2018,7 +2018,7 @@ void SymbolFileDWARF::UpdateExternalModuleListIfNeeded() {
}
Status error = ModuleList::GetSharedModule(dwo_module_spec, module_sp,
- nullptr, nullptr, nullptr);
+ nullptr, nullptr);
if (!module_sp) {
// ReportWarning also rate-limits based on the warning string,
// but in a -gmodules build, each object file has a similar DAG
diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt
index 8e6d51e..cff5904 100644
--- a/lldb/source/Target/CMakeLists.txt
+++ b/lldb/source/Target/CMakeLists.txt
@@ -38,6 +38,7 @@ add_lldb_library(lldbTarget
RegisterNumber.cpp
RemoteAwarePlatform.cpp
ScriptedThreadPlan.cpp
+ SyntheticFrameProvider.cpp
SectionLoadHistory.cpp
SectionLoadList.cpp
StackFrame.cpp
diff --git a/lldb/source/Target/ModuleCache.cpp b/lldb/source/Target/ModuleCache.cpp
index f737836..9978946 100644
--- a/lldb/source/Target/ModuleCache.cpp
+++ b/lldb/source/Target/ModuleCache.cpp
@@ -255,7 +255,7 @@ Status ModuleCache::Get(const FileSpec &root_dir_spec, const char *hostname,
cached_module_spec.GetPlatformFileSpec() = module_spec.GetFileSpec();
error = ModuleList::GetSharedModule(cached_module_spec, cached_module_sp,
- nullptr, nullptr, did_create_ptr, false);
+ nullptr, did_create_ptr, false);
if (error.Fail())
return error;
diff --git a/lldb/source/Target/Platform.cpp b/lldb/source/Target/Platform.cpp
index 8681ada..5b0930c 100644
--- a/lldb/source/Target/Platform.cpp
+++ b/lldb/source/Target/Platform.cpp
@@ -163,11 +163,12 @@ Platform::LocateExecutableScriptingResources(Target *target, Module &module,
Status Platform::GetSharedModule(
const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) {
if (IsHost())
- return ModuleList::GetSharedModule(module_spec, module_sp,
- module_search_paths_ptr, old_modules,
+ // Note: the module_search_paths_ptr functionality is now handled
+ // internally by getting the target from the module_spec and calling
+ // target->GetExecutableSearchPaths().
+ return ModuleList::GetSharedModule(module_spec, module_sp, old_modules,
did_create_ptr, false);
// Module resolver lambda.
@@ -180,16 +181,14 @@ Status Platform::GetSharedModule(
resolved_spec = spec;
resolved_spec.GetFileSpec().PrependPathComponent(m_sdk_sysroot);
// Try to get shared module with resolved spec.
- error = ModuleList::GetSharedModule(resolved_spec, module_sp,
- module_search_paths_ptr, old_modules,
+ error = ModuleList::GetSharedModule(resolved_spec, module_sp, old_modules,
did_create_ptr, false);
}
// If we don't have sysroot or it didn't work then
// try original module spec.
if (!error.Success()) {
resolved_spec = spec;
- error = ModuleList::GetSharedModule(resolved_spec, module_sp,
- module_search_paths_ptr, old_modules,
+ error = ModuleList::GetSharedModule(resolved_spec, module_sp, old_modules,
did_create_ptr, false);
}
if (error.Success() && module_sp)
@@ -731,10 +730,8 @@ bool Platform::SetOSVersion(llvm::VersionTuple version) {
return false;
}
-Status
-Platform::ResolveExecutable(const ModuleSpec &module_spec,
- lldb::ModuleSP &exe_module_sp,
- const FileSpecList *module_search_paths_ptr) {
+Status Platform::ResolveExecutable(const ModuleSpec &module_spec,
+ lldb::ModuleSP &exe_module_sp) {
// We may connect to a process and use the provided executable (Don't use
// local $PATH).
@@ -750,9 +747,8 @@ Platform::ResolveExecutable(const ModuleSpec &module_spec,
if (resolved_module_spec.GetArchitecture().IsValid() ||
resolved_module_spec.GetUUID().IsValid()) {
- Status error =
- ModuleList::GetSharedModule(resolved_module_spec, exe_module_sp,
- module_search_paths_ptr, nullptr, nullptr);
+ Status error = ModuleList::GetSharedModule(resolved_module_spec,
+ exe_module_sp, nullptr, nullptr);
if (exe_module_sp && exe_module_sp->GetObjectFile())
return error;
@@ -767,9 +763,9 @@ Platform::ResolveExecutable(const ModuleSpec &module_spec,
Status error;
for (const ArchSpec &arch : GetSupportedArchitectures(process_host_arch)) {
resolved_module_spec.GetArchitecture() = arch;
- error =
- ModuleList::GetSharedModule(resolved_module_spec, exe_module_sp,
- module_search_paths_ptr, nullptr, nullptr);
+
+ error = ModuleList::GetSharedModule(resolved_module_spec, exe_module_sp,
+ nullptr, nullptr);
if (error.Success()) {
if (exe_module_sp && exe_module_sp->GetObjectFile())
break;
@@ -1446,16 +1442,13 @@ const std::vector<ConstString> &Platform::GetTrapHandlerSymbolNames() {
return m_trap_handlers;
}
-Status
-Platform::GetCachedExecutable(ModuleSpec &module_spec,
- lldb::ModuleSP &module_sp,
- const FileSpecList *module_search_paths_ptr) {
+Status Platform::GetCachedExecutable(ModuleSpec &module_spec,
+ lldb::ModuleSP &module_sp) {
FileSpec platform_spec = module_spec.GetFileSpec();
Status error = GetRemoteSharedModule(
module_spec, nullptr, module_sp,
[&](const ModuleSpec &spec) {
- return Platform::ResolveExecutable(spec, module_sp,
- module_search_paths_ptr);
+ return Platform::ResolveExecutable(spec, module_sp);
},
nullptr);
if (error.Success()) {
@@ -1497,7 +1490,7 @@ Status Platform::GetRemoteSharedModule(const ModuleSpec &module_spec,
for (const ArchSpec &arch : GetSupportedArchitectures(process_host_arch)) {
arch_module_spec.GetArchitecture() = arch;
error = ModuleList::GetSharedModule(arch_module_spec, module_sp, nullptr,
- nullptr, nullptr);
+ nullptr);
// Did we find an executable using one of the
if (error.Success() && module_sp)
break;
@@ -1673,11 +1666,12 @@ void Platform::CallLocateModuleCallbackIfSet(const ModuleSpec &module_spec,
cached_module_spec.GetUUID().Clear(); // Clear UUID since it may contain md5
// content hash instead of real UUID.
cached_module_spec.GetFileSpec() = module_file_spec;
+ cached_module_spec.GetSymbolFileSpec() = symbol_file_spec;
cached_module_spec.GetPlatformFileSpec() = module_spec.GetFileSpec();
cached_module_spec.SetObjectOffset(0);
error = ModuleList::GetSharedModule(cached_module_spec, module_sp, nullptr,
- nullptr, did_create_ptr, false);
+ did_create_ptr, false, false);
if (error.Success() && module_sp) {
// Succeeded to load the module file.
LLDB_LOGF(log, "%s: locate module callback succeeded: module=%s symbol=%s",
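The change above repeats across this patch: the explicit module_search_paths_ptr parameter is dropped, and search paths now travel with the ModuleSpec through its target. A sketch of the new call shape, with exe_file_spec, arch, and target_sp as assumed inputs:

    ModuleSpec spec(exe_file_spec, arch);
    spec.SetTarget(target_sp); // the target now supplies executable search paths
    ModuleSP module_sp;
    Status error = ModuleList::GetSharedModule(spec, module_sp,
                                               /*old_modules=*/nullptr,
                                               /*did_create_ptr=*/nullptr);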
diff --git a/lldb/source/Target/RemoteAwarePlatform.cpp b/lldb/source/Target/RemoteAwarePlatform.cpp
index cac738e..89b946b 100644
--- a/lldb/source/Target/RemoteAwarePlatform.cpp
+++ b/lldb/source/Target/RemoteAwarePlatform.cpp
@@ -29,9 +29,8 @@ bool RemoteAwarePlatform::GetModuleSpec(const FileSpec &module_file_spec,
return false;
}
-Status RemoteAwarePlatform::ResolveExecutable(
- const ModuleSpec &module_spec, lldb::ModuleSP &exe_module_sp,
- const FileSpecList *module_search_paths_ptr) {
+Status RemoteAwarePlatform::ResolveExecutable(const ModuleSpec &module_spec,
+ lldb::ModuleSP &exe_module_sp) {
ModuleSpec resolved_module_spec(module_spec);
// The host platform can resolve the path more aggressively.
@@ -47,12 +46,10 @@ Status RemoteAwarePlatform::ResolveExecutable(
if (!FileSystem::Instance().Exists(resolved_file_spec))
FileSystem::Instance().ResolveExecutableLocation(resolved_file_spec);
} else if (m_remote_platform_sp) {
- return GetCachedExecutable(resolved_module_spec, exe_module_sp,
- module_search_paths_ptr);
+ return GetCachedExecutable(resolved_module_spec, exe_module_sp);
}
- return Platform::ResolveExecutable(resolved_module_spec, exe_module_sp,
- module_search_paths_ptr);
+ return Platform::ResolveExecutable(resolved_module_spec, exe_module_sp);
}
Status RemoteAwarePlatform::RunShellCommand(
diff --git a/lldb/source/Target/SyntheticFrameProvider.cpp b/lldb/source/Target/SyntheticFrameProvider.cpp
new file mode 100644
index 0000000..241ce82
--- /dev/null
+++ b/lldb/source/Target/SyntheticFrameProvider.cpp
@@ -0,0 +1,100 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Target/SyntheticFrameProvider.h"
+#include "lldb/Core/PluginManager.h"
+#include "lldb/Target/Thread.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "lldb/Utility/Log.h"
+#include "lldb/Utility/Status.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+SyntheticFrameProvider::SyntheticFrameProvider(StackFrameListSP input_frames)
+ : m_input_frames(std::move(input_frames)) {}
+
+SyntheticFrameProvider::~SyntheticFrameProvider() = default;
+
+void SyntheticFrameProviderDescriptor::Dump(Stream *s) const {
+ if (!s)
+ return;
+
+ s->Printf(" Name: %s\n", GetName().str().c_str());
+
+ // Show thread filter information.
+ if (thread_specs.empty()) {
+ s->PutCString(" Thread Filter: (applies to all threads)\n");
+ } else {
+ s->Printf(" Thread Filter: %zu specification(s)\n", thread_specs.size());
+ for (size_t i = 0; i < thread_specs.size(); ++i) {
+ const ThreadSpec &spec = thread_specs[i];
+ s->Printf(" [%zu] ", i);
+ spec.GetDescription(s, lldb::eDescriptionLevelVerbose);
+ s->PutChar('\n');
+ }
+ }
+}
+
+llvm::Expected<SyntheticFrameProviderSP> SyntheticFrameProvider::CreateInstance(
+ StackFrameListSP input_frames,
+ const SyntheticFrameProviderDescriptor &descriptor) {
+ if (!input_frames)
+ return llvm::createStringError(
+ "cannot create synthetic frame provider: invalid input frames");
+
+ // Iterate through all registered ScriptedFrameProvider plugins.
+ ScriptedFrameProviderCreateInstance create_callback = nullptr;
+ for (uint32_t idx = 0;
+ (create_callback =
+ PluginManager::GetScriptedFrameProviderCreateCallbackAtIndex(
+ idx)) != nullptr;
+ ++idx) {
+ auto provider_or_err = create_callback(input_frames, descriptor);
+ if (!provider_or_err) {
+ LLDB_LOG_ERROR(GetLog(LLDBLog::Target), provider_or_err.takeError(),
+ "Failed to create synthetic frame provider: {0}");
+ continue;
+ }
+
+ if (auto frame_provider_up = std::move(*provider_or_err))
+ return std::move(frame_provider_up);
+ }
+
+ return llvm::createStringError(
+ "cannot create synthetic frame provider: no suitable plugin found");
+}
+
+llvm::Expected<SyntheticFrameProviderSP> SyntheticFrameProvider::CreateInstance(
+ StackFrameListSP input_frames, llvm::StringRef plugin_name,
+ const std::vector<ThreadSpec> &thread_specs) {
+ if (!input_frames)
+ return llvm::createStringError(
+ "cannot create synthetic frame provider: invalid input frames");
+
+ // Look up the specific C++ plugin by name.
+ SyntheticFrameProviderCreateInstance create_callback =
+ PluginManager::GetSyntheticFrameProviderCreateCallbackForPluginName(
+ plugin_name);
+
+ if (!create_callback)
+ return llvm::createStringError(
+ "cannot create synthetic frame provider: C++ plugin '%s' not found",
+ plugin_name.str().c_str());
+
+ auto provider_or_err = create_callback(input_frames, thread_specs);
+ if (!provider_or_err)
+ return provider_or_err.takeError();
+
+ if (auto frame_provider_sp = std::move(*provider_or_err))
+ return std::move(frame_provider_sp);
+
+ return llvm::createStringError(
+ "cannot create synthetic frame provider: C++ plugin '%s' returned null",
+ plugin_name.str().c_str());
+}
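A usage note on the second CreateInstance overload: it selects a C++ provider by plugin name instead of probing every registered scripted provider. A hedged sketch, where "inline-expander" is a made-up plugin name:

    // Apply a named C++ provider to all threads (empty thread-spec list).
    auto provider_or_err = SyntheticFrameProvider::CreateInstance(
        input_frames, "inline-expander", /*thread_specs=*/{});
    if (!provider_or_err)
      llvm::consumeError(provider_or_err.takeError()); // plugin missing, etc.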
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index e53fc7a..ae6c4f7 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -1779,9 +1779,9 @@ bool Target::SetArchitecture(const ArchSpec &arch_spec, bool set_platform,
arch_spec.GetArchitectureName(),
arch_spec.GetTriple().getTriple().c_str());
ModuleSpec module_spec(executable_sp->GetFileSpec(), other);
- FileSpecList search_paths = GetExecutableSearchPaths();
+ module_spec.SetTarget(shared_from_this());
Status error = ModuleList::GetSharedModule(module_spec, executable_sp,
- &search_paths, nullptr, nullptr);
+ nullptr, nullptr);
if (!error.Fail() && executable_sp) {
SetExecutableModule(executable_sp, eLoadDependentsYes);
@@ -2350,6 +2350,7 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
// Apply any remappings specified in target.object-map:
ModuleSpec module_spec(orig_module_spec);
+ module_spec.SetTarget(shared_from_this());
PathMappingList &obj_mapping = GetObjectPathMap();
if (std::optional<FileSpec> remapped_obj_file =
obj_mapping.RemapPath(orig_module_spec.GetFileSpec().GetPath(),
@@ -2408,9 +2409,9 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
transformed_spec.GetFileSpec().SetDirectory(transformed_dir);
transformed_spec.GetFileSpec().SetFilename(
module_spec.GetFileSpec().GetFilename());
+ transformed_spec.SetTarget(shared_from_this());
error = ModuleList::GetSharedModule(transformed_spec, module_sp,
- &search_paths, &old_modules,
- &did_create_module);
+ &old_modules, &did_create_module);
}
}
}
@@ -2426,9 +2427,8 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
// cache.
if (module_spec.GetUUID().IsValid()) {
// We have a UUID, it is OK to check the global module list...
- error =
- ModuleList::GetSharedModule(module_spec, module_sp, &search_paths,
- &old_modules, &did_create_module);
+ error = ModuleList::GetSharedModule(module_spec, module_sp,
+ &old_modules, &did_create_module);
}
if (!module_sp) {
@@ -2436,8 +2436,8 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
// module in the shared module cache.
if (m_platform_sp) {
error = m_platform_sp->GetSharedModule(
- module_spec, m_process_sp.get(), module_sp, &search_paths,
- &old_modules, &did_create_module);
+ module_spec, m_process_sp.get(), module_sp, &old_modules,
+ &did_create_module);
} else {
error = Status::FromErrorString("no platform is currently set");
}
diff --git a/lldb/source/Target/TargetList.cpp b/lldb/source/Target/TargetList.cpp
index 188c250..2e03bc1 100644
--- a/lldb/source/Target/TargetList.cpp
+++ b/lldb/source/Target/TargetList.cpp
@@ -304,13 +304,9 @@ Status TargetList::CreateTargetInternal(Debugger &debugger,
ModuleSP exe_module_sp;
if (platform_sp) {
- FileSpecList executable_search_paths(
- Target::GetDefaultExecutableSearchPaths());
ModuleSpec module_spec(file, arch);
- error = platform_sp->ResolveExecutable(module_spec, exe_module_sp,
- executable_search_paths.GetSize()
- ? &executable_search_paths
- : nullptr);
+ module_spec.SetTarget(target_sp);
+ error = platform_sp->ResolveExecutable(module_spec, exe_module_sp);
}
if (error.Success() && exe_module_sp) {
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py
index 07d6c96..ca2d2d6 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py
@@ -9,6 +9,8 @@ from lldbsuite.test import lldbutil
class StdMapDataFormatterTestCase(TestBase):
+ TEST_WITH_PDB_DEBUG_INFO = True
+
def setUp(self):
TestBase.setUp(self)
ns = "ndk" if lldbplatformutil.target_is_android() else ""
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multimap/TestDataFormatterGenericMultiMap.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multimap/TestDataFormatterGenericMultiMap.py
index 7ac7971..4b0854b 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multimap/TestDataFormatterGenericMultiMap.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multimap/TestDataFormatterGenericMultiMap.py
@@ -11,6 +11,8 @@ from lldbsuite.test import lldbutil
class GenericMultiMapDataFormatterTestCase(TestBase):
+ TEST_WITH_PDB_DEBUG_INFO = True
+
def setUp(self):
TestBase.setUp(self)
self.namespace = "std"
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py
index 7e922fc..e846e07 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py
@@ -10,6 +10,8 @@ from lldbsuite.test import lldbutil
class GenericMultiSetDataFormatterTestCase(TestBase):
+ TEST_WITH_PDB_DEBUG_INFO = True
+
def setUp(self):
TestBase.setUp(self)
self.namespace = "std"
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/set/TestDataFormatterGenericSet.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/set/TestDataFormatterGenericSet.py
index 1ac5e32..355f0c6 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/set/TestDataFormatterGenericSet.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/set/TestDataFormatterGenericSet.py
@@ -10,6 +10,8 @@ from lldbsuite.test import lldbutil
class GenericSetDataFormatterTestCase(TestBase):
+ TEST_WITH_PDB_DEBUG_INFO = True
+
def setUp(self):
TestBase.setUp(self)
self.namespace = "std"
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py
index b23d549..8984387 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py
@@ -9,6 +9,8 @@ from lldbsuite.test import lldbutil
class TestDataFormatterStdTuple(TestBase):
+ TEST_WITH_PDB_DEBUG_INFO = True
+
def setUp(self):
TestBase.setUp(self)
self.line = line_number("main.cpp", "// break here")
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py
index dd142d2..f74092c 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py
@@ -9,6 +9,8 @@ from lldbsuite.test import lldbutil
class StdVBoolDataFormatterTestCase(TestBase):
+ TEST_WITH_PDB_DEBUG_INFO = True
+
def setUp(self):
# Call super's setUp().
TestBase.setUp(self)
diff --git a/lldb/test/Shell/Commands/Inputs/sigchld.c b/lldb/test/Shell/Commands/Inputs/sigchld.c
index ba8c5ef..0121e70 100644
--- a/lldb/test/Shell/Commands/Inputs/sigchld.c
+++ b/lldb/test/Shell/Commands/Inputs/sigchld.c
@@ -1,3 +1,7 @@
+#if defined(__linux__)
+#define _XOPEN_SOURCE 500 /* for CLD_EXITED */
+#endif
+
#include <assert.h>
#include <signal.h>
#include <stdio.h>
diff --git a/lldb/test/Shell/Commands/command-list-reach-beginning-of-file.test b/lldb/test/Shell/Commands/command-list-reach-beginning-of-file.test
index fa4a93e..9987efe 100644
--- a/lldb/test/Shell/Commands/command-list-reach-beginning-of-file.test
+++ b/lldb/test/Shell/Commands/command-list-reach-beginning-of-file.test
@@ -4,7 +4,7 @@
# RUN: %lldb %t.out -b -s %s 2>&1 | FileCheck %s
list
-# CHECK: note: No source available
+# CHECK: note: No source available
b main
# CHECK: Breakpoint 1:
@@ -18,7 +18,7 @@ list
list -
# CHECK: int main()
-list -10
+list -13
# CHECK: #include <assert.h>
list -
diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm
index 3afaaa2..8df3f29 100644
--- a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm
+++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm
@@ -2853,12 +2853,6 @@ pid_t MachProcess::AttachForDebug(
if (err.Success()) {
m_flags |= eMachProcessFlagsAttached;
- // Sleep a bit to let the exception get received and set our process
- // status
- // to stopped.
- ::usleep(250000);
- DNBLog("[LaunchAttach] (%d) Done napping after ptrace(PT_ATTACHEXC)'ing",
- getpid());
DNBLogThreadedIf(LOG_PROCESS, "successfully attached to pid %d", pid);
return m_pid;
} else {
diff --git a/lldb/unittests/Core/CMakeLists.txt b/lldb/unittests/Core/CMakeLists.txt
index 6e609a6..f0c9a9a 100644
--- a/lldb/unittests/Core/CMakeLists.txt
+++ b/lldb/unittests/Core/CMakeLists.txt
@@ -7,6 +7,7 @@ add_lldb_unittest(LLDBCoreTests
DumpRegisterInfoTest.cpp
FormatEntityTest.cpp
MangledTest.cpp
+ ModuleListTest.cpp
ModuleSpecTest.cpp
PluginManagerTest.cpp
ProgressReportTest.cpp
diff --git a/lldb/unittests/Core/ModuleListTest.cpp b/lldb/unittests/Core/ModuleListTest.cpp
new file mode 100644
index 0000000..3c70b0a
--- /dev/null
+++ b/lldb/unittests/Core/ModuleListTest.cpp
@@ -0,0 +1,178 @@
+//===-- ModuleListTest.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Core/ModuleList.h"
+#include "TestingSupport/SubsystemRAII.h"
+#include "TestingSupport/TestUtilities.h"
+#include "lldb/Core/Module.h"
+#include "lldb/Core/ModuleSpec.h"
+#include "lldb/Host/FileSystem.h"
+#include "lldb/Utility/ArchSpec.h"
+#include "lldb/Utility/UUID.h"
+
+#include "Plugins/ObjectFile/ELF/ObjectFileELF.h"
+
+#include "gtest/gtest.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+// Test that when we already have a module in the shared_module_list with a
+// specific UUID, the next call to GetSharedModule with a module_spec with the
+// same UUID should return the existing module instead of creating a new one.
+TEST(ModuleListTest, GetSharedModuleReusesExistingModuleWithSameUUID) {
+ SubsystemRAII<FileSystem, ObjectFileELF> subsystems;
+
+ auto ExpectedFile = TestFile::fromYaml(R"(
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_X86_64
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ AddressAlign: 0x0000000000000010
+...
+)");
+ ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded());
+
+ // First, let's verify that calling GetSharedModule twice with the same
+ // module_spec returns the same module pointer.
+
+ ModuleSP first_module;
+ bool first_did_create = false;
+ Status error_first =
+ ModuleList::GetSharedModule(ExpectedFile->moduleSpec(), first_module,
+ nullptr, &first_did_create, false);
+
+ // Second call with the same spec
+ ModuleSP second_module;
+ bool second_did_create = false;
+ Status error_second =
+ ModuleList::GetSharedModule(ExpectedFile->moduleSpec(), second_module,
+ nullptr, &second_did_create, false);
+
+ if (error_first.Success() && error_second.Success()) {
+ // If both succeeded, verify they're the same module
+ EXPECT_EQ(first_module.get(), second_module.get())
+ << "GetSharedModule should return the same module for the same spec";
+ EXPECT_TRUE(first_did_create) << "First call should create the module";
+ EXPECT_FALSE(second_did_create)
+ << "Second call should reuse the existing module";
+ }
+}
+
+// Test that UUID-based lookup finds existing modules
+TEST(ModuleListTest, FindSharedModuleByUUID) {
+ SubsystemRAII<FileSystem, ObjectFileELF> subsystems;
+
+ auto ExpectedFile = TestFile::fromYaml(R"(
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_X86_64
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ AddressAlign: 0x0000000000000010
+...
+)");
+ ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded());
+
+ // Create and add a module to the shared module list using the moduleSpec()
+ ModuleSP created_module;
+ bool did_create = false;
+ Status error = ModuleList::GetSharedModule(
+ ExpectedFile->moduleSpec(), created_module, nullptr, &did_create, false);
+
+ if (error.Success() && created_module) {
+ // Get the UUID of the created module
+ UUID module_uuid = created_module->GetUUID();
+
+ if (module_uuid.IsValid()) {
+ // Now try to find the module by UUID
+ ModuleSP found_module = ModuleList::FindSharedModule(module_uuid);
+
+ ASSERT_NE(found_module.get(), nullptr)
+ << "FindSharedModule should find the module by UUID";
+ EXPECT_EQ(found_module.get(), created_module.get())
+ << "FindSharedModule should return the same module instance";
+ EXPECT_EQ(found_module->GetUUID(), module_uuid)
+ << "Found module should have the same UUID";
+ }
+ }
+}
+
+// Test that GetSharedModule with a UUID finds the existing module even when
+// the path differs.
+TEST(ModuleListTest, GetSharedModuleByUUIDIgnoresPath) {
+ SubsystemRAII<FileSystem, ObjectFileELF> subsystems;
+
+ auto ExpectedFile = TestFile::fromYaml(R"(
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_X86_64
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ AddressAlign: 0x0000000000000010
+...
+)");
+ ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded());
+
+ // Create and add a module to the shared module list
+ ModuleSP first_module;
+ bool first_did_create = false;
+ Status first_error =
+ ModuleList::GetSharedModule(ExpectedFile->moduleSpec(), first_module,
+ nullptr, &first_did_create, false);
+
+ if (first_error.Success() && first_module) {
+ UUID module_uuid = first_module->GetUUID();
+
+ if (module_uuid.IsValid()) {
+ // Now try to get a module with the same UUID but different path
+ ModuleSpec second_spec;
+ second_spec.GetFileSpec() = FileSpec("/different/path/to/module.so");
+ second_spec.GetArchitecture() = ArchSpec("x86_64-pc-linux");
+ second_spec.GetUUID() = module_uuid;
+
+ ModuleSP second_module;
+ bool second_did_create = false;
+ Status second_error = ModuleList::GetSharedModule(
+ second_spec, second_module, nullptr, &second_did_create, false);
+
+ if (second_error.Success() && second_module) {
+ // If we got a module back, check if it's the same one
+ bool is_same_module = (second_module.get() == first_module.get());
+
+ // Document the behavior: ideally the UUID should take precedence
+ // and the existing module should be returned.
+ EXPECT_TRUE(is_same_module)
+ << "GetSharedModule with matching UUID should return existing "
+ "module, "
+ << "even with different path (per PR #160199)";
+
+ if (is_same_module) {
+ EXPECT_FALSE(second_did_create)
+ << "Should not create a new module when UUID matches";
+ }
+ }
+ }
+ }
+}
diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
index 3d0e2d8..a63b740 100644
--- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
+++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
@@ -161,6 +161,11 @@ void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBExecutionContext(
return nullptr;
}
+void *
+lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrameList(PyObject *data) {
+ return nullptr;
+}
+
lldb::ValueObjectSP
lldb_private::python::SWIGBridge::LLDBSWIGPython_GetValueObjectSPFromSBValue(
void *data) {
@@ -329,6 +334,11 @@ lldb_private::python::SWIGBridge::ToSWIGWrapper(lldb::ProcessSP) {
return python::PythonObject();
}
+python::PythonObject
+lldb_private::python::SWIGBridge::ToSWIGWrapper(lldb::StackFrameListSP) {
+ return python::PythonObject();
+}
+
python::PythonObject lldb_private::python::SWIGBridge::ToSWIGWrapper(
const lldb_private::StructuredDataImpl &) {
return python::PythonObject();
diff --git a/lldb/unittests/Target/LocateModuleCallbackTest.cpp b/lldb/unittests/Target/LocateModuleCallbackTest.cpp
index 6ffa41b..d727cea 100644
--- a/lldb/unittests/Target/LocateModuleCallbackTest.cpp
+++ b/lldb/unittests/Target/LocateModuleCallbackTest.cpp
@@ -362,7 +362,7 @@ TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackFailureNoCache) {
});
m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false);
- ASSERT_EQ(callback_call_count, 2);
+ ASSERT_EQ(callback_call_count, 3);
ASSERT_FALSE(m_module_sp);
}
@@ -383,7 +383,7 @@ TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackFailureCached) {
});
m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false);
- ASSERT_EQ(callback_call_count, 2);
+ ASSERT_EQ(callback_call_count, 3);
CheckModule(m_module_sp);
ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view);
ASSERT_FALSE(m_module_sp->GetSymbolFileFileSpec());
@@ -409,7 +409,7 @@ TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackNoFiles) {
});
m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false);
- ASSERT_EQ(callback_call_count, 2);
+ ASSERT_EQ(callback_call_count, 3);
CheckModule(m_module_sp);
ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view);
ASSERT_FALSE(m_module_sp->GetSymbolFileFileSpec());
@@ -435,7 +435,7 @@ TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackNonExistentModule) {
});
m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false);
- ASSERT_EQ(callback_call_count, 2);
+ ASSERT_EQ(callback_call_count, 3);
CheckModule(m_module_sp);
ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view);
ASSERT_FALSE(m_module_sp->GetSymbolFileFileSpec());
@@ -464,7 +464,7 @@ TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackNonExistentSymbol) {
});
m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false);
- ASSERT_EQ(callback_call_count, 2);
+ ASSERT_EQ(callback_call_count, 3);
CheckModule(m_module_sp);
ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view);
ASSERT_TRUE(m_module_sp->GetSymbolFileFileSpec().GetPath().empty());
@@ -622,7 +622,7 @@ TEST_F(LocateModuleCallbackTest,
});
m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false);
- ASSERT_EQ(callback_call_count, 2);
+ ASSERT_EQ(callback_call_count, 3);
CheckModule(m_module_sp);
ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view);
ASSERT_EQ(m_module_sp->GetSymbolFileFileSpec(),
@@ -650,7 +650,7 @@ TEST_F(LocateModuleCallbackTest,
});
m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false);
- ASSERT_EQ(callback_call_count, 2);
+ ASSERT_EQ(callback_call_count, 3);
CheckModule(m_module_sp);
ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view);
ASSERT_EQ(m_module_sp->GetSymbolFileFileSpec(),
@@ -682,7 +682,7 @@ TEST_F(LocateModuleCallbackTest,
});
m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false);
- ASSERT_EQ(callback_call_count, 2);
+ ASSERT_EQ(callback_call_count, 3);
CheckModule(m_module_sp);
ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view);
ASSERT_EQ(m_module_sp->GetSymbolFileFileSpec(),
@@ -709,7 +709,7 @@ TEST_F(LocateModuleCallbackTest,
});
m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false);
- ASSERT_EQ(callback_call_count, 2);
+ ASSERT_EQ(callback_call_count, 3);
ASSERT_FALSE(m_module_sp);
}
@@ -731,7 +731,7 @@ TEST_F(LocateModuleCallbackTest,
});
m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false);
- ASSERT_EQ(callback_call_count, 2);
+ ASSERT_EQ(callback_call_count, 3);
ASSERT_FALSE(m_module_sp);
}
diff --git a/lldb/unittests/Target/RemoteAwarePlatformTest.cpp b/lldb/unittests/Target/RemoteAwarePlatformTest.cpp
index 3278674..cfcec69 100644
--- a/lldb/unittests/Target/RemoteAwarePlatformTest.cpp
+++ b/lldb/unittests/Target/RemoteAwarePlatformTest.cpp
@@ -32,15 +32,12 @@ public:
ProcessSP(ProcessAttachInfo &, Debugger &, Target *, Status &));
MOCK_METHOD0(CalculateTrapHandlerSymbolNames, void());
- MOCK_METHOD2(ResolveExecutable,
- std::pair<bool, ModuleSP>(const ModuleSpec &,
- const FileSpecList *));
- Status
- ResolveExecutable(const ModuleSpec &module_spec,
- lldb::ModuleSP &exe_module_sp,
- const FileSpecList *module_search_paths_ptr) /*override*/
+ MOCK_METHOD1(ResolveExecutable,
+ std::pair<bool, ModuleSP>(const ModuleSpec &));
+ Status ResolveExecutable(const ModuleSpec &module_spec,
+ lldb::ModuleSP &exe_module_sp) /*override*/
{ // NOLINT(modernize-use-override)
- auto pair = ResolveExecutable(module_spec, module_search_paths_ptr);
+ auto pair = ResolveExecutable(module_spec);
exe_module_sp = pair.second;
return pair.first ? Status() : Status::FromErrorString("error");
}
@@ -80,14 +77,14 @@ TEST_F(RemoteAwarePlatformTest, TestResolveExecutabelOnClientByPlatform) {
static const ArchSpec process_host_arch;
EXPECT_CALL(platform, GetSupportedArchitectures(process_host_arch))
.WillRepeatedly(Return(std::vector<ArchSpec>()));
- EXPECT_CALL(platform, ResolveExecutable(_, _))
+ EXPECT_CALL(platform, ResolveExecutable(_))
.WillRepeatedly(Return(std::make_pair(true, expected_executable)));
platform.SetRemotePlatform(std::make_shared<TargetPlatformTester>(false));
ModuleSP resolved_sp;
lldb_private::Status status =
- platform.ResolveExecutable(executable_spec, resolved_sp, nullptr);
+ platform.ResolveExecutable(executable_spec, resolved_sp);
ASSERT_TRUE(status.Success());
EXPECT_EQ(expected_executable.get(), resolved_sp.get());
diff --git a/llvm/docs/Extensions.rst b/llvm/docs/Extensions.rst
index 91a3ac0..4bff111 100644
--- a/llvm/docs/Extensions.rst
+++ b/llvm/docs/Extensions.rst
@@ -274,13 +274,13 @@ This would be equivalent to the following raw assembly:
The following directives are specified:
- - lib
+ - ``lib``
The parameter identifies a library to be linked against. The library will
be looked up in the default and any specified library search paths
(specified to this point).
- - libpath
+ - ``libpath``
The parameter identifies an additional library search path to be considered
when looking up libraries after the inclusion of this option.
@@ -327,13 +327,13 @@ The contents of the section shall be a sequence of ``Elf_CGProfile`` entries.
Elf_Xword cgp_weight;
} Elf_CGProfile;
-cgp_from
+``cgp_from``
The symbol index of the source of the edge.
-cgp_to
+``cgp_to``
The symbol index of the destination of the edge.
-cgp_weight
+``cgp_weight``
The weight of the edge.
This is represented in assembly as:
@@ -352,7 +352,7 @@ table.
``SHT_LLVM_ADDRSIG`` Section (address-significance table)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-This section is used to mark symbols as address-significant, i.e. the address
+This section is used to mark symbols as address-significant, i.e., the address
of the symbol is used in a comparison or leaks outside the translation unit. It
has the same meaning as the absence of the LLVM attributes ``unnamed_addr``
and ``local_unnamed_addr``.
@@ -519,11 +519,11 @@ those bits are:
#. Basic Block Frequencies - Encoded as raw block frequency value taken from
MBFI analysis. This value is an integer that encodes the relative frequency
compared to the entry block. More information can be found in
- 'llvm/Support/BlockFrequency.h'.
+ ``llvm/Support/BlockFrequency.h``.
#. Branch Probabilities - Encoded as raw numerator for branch probability
taken from MBPI analysis. This value is the numerator for a fixed point ratio
- defined in 'llvm/Support/BranchProbability.h'. It indicates the probability
+ defined in ``llvm/Support/BranchProbability.h``. It indicates the probability
that the block is followed by a given successor block during execution.
This extra data requires version 2 or above. This is necessary since successors
@@ -726,7 +726,7 @@ Syntax:
Syntax:
``.cv_fpo_data`` *procsym*
-Target Specific Behaviour
+Target-Specific Behaviour
=========================
X86
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index b9507a2..bd0337f 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -30968,6 +30968,37 @@ This intrinsic does nothing, but optimizers must consider it a use of its single
operand and should try to preserve the intrinsic and its position in the
function.
+.. _llvm_reloc_none:
+
+'``llvm.reloc.none``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare void @llvm.reloc.none(metadata !<name_str>)
+
+Overview:
+"""""""""
+
+The ``llvm.reloc.none`` intrinsic emits a no-op relocation against a given
+operand symbol. This can bring the symbol definition into the link without
+emitting any code or data to the binary for that purpose.
+
+Arguments:
+""""""""""
+
+The ``llvm.reloc.none`` intrinsic takes the symbol as a metadata string
+argument.
+
+Semantics:
+""""""""""
+
+This intrinsic emits a no-op relocation for the symbol at the location of the
+intrinsic call.
+
Stack Map Intrinsics
--------------------
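For illustration only (not part of the LangRef text): a frontend could emit the new intrinsic through the C++ IRBuilder API roughly as follows; the exact CreateIntrinsic overload is an assumption.

    llvm::LLVMContext &Ctx = M.getContext();
    // The symbol name travels as a metadata string, per the signature above.
    llvm::Value *Sym = llvm::MetadataAsValue::get(
        Ctx, llvm::MDString::get(Ctx, "kept_symbol"));
    // Emits no code or data, only a no-op relocation against kept_symbol.
    Builder.CreateIntrinsic(llvm::Intrinsic::reloc_none, /*Types=*/{},
                            /*Args=*/{Sym});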
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index bfe6827..23bba99 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -67,6 +67,9 @@ Changes to the LLVM IR
Instead, the `align` attribute should be placed on the pointer (or vector of
pointers) argument.
* A `load atomic` may now be used with vector types on x86.
+* Added `@llvm.reloc.none` intrinsic to emit null relocations to symbols. This
+  emits an undefined symbol reference without adding any dedicated code or
+  data to bear the relocation.
Changes to LLVM infrastructure
------------------------------
diff --git a/llvm/include/llvm/Analysis/RegionPrinter.h b/llvm/include/llvm/Analysis/RegionPrinter.h
index 3a1d11d..1d4ba0f 100644
--- a/llvm/include/llvm/Analysis/RegionPrinter.h
+++ b/llvm/include/llvm/Analysis/RegionPrinter.h
@@ -18,64 +18,64 @@
#include "llvm/Support/DOTGraphTraits.h"
namespace llvm {
- class FunctionPass;
- class Function;
- class RegionInfo;
- class RegionNode;
+class FunctionPass;
+class Function;
+class RegionInfo;
+class RegionNode;
- LLVM_ABI FunctionPass *createRegionViewerPass();
- LLVM_ABI FunctionPass *createRegionOnlyViewerPass();
- LLVM_ABI FunctionPass *createRegionPrinterPass();
- LLVM_ABI FunctionPass *createRegionOnlyPrinterPass();
+LLVM_ABI FunctionPass *createRegionViewerPass();
+LLVM_ABI FunctionPass *createRegionOnlyViewerPass();
+LLVM_ABI FunctionPass *createRegionPrinterPass();
+LLVM_ABI FunctionPass *createRegionOnlyPrinterPass();
- template <>
- struct DOTGraphTraits<RegionNode *> : public DefaultDOTGraphTraits {
- DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+template <> struct DOTGraphTraits<RegionNode *> : public DefaultDOTGraphTraits {
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
- LLVM_ABI std::string getNodeLabel(RegionNode *Node, RegionNode *Graph);
- };
+ LLVM_ABI std::string getNodeLabel(RegionNode *Node, RegionNode *Graph);
+};
#ifndef NDEBUG
- /// Open a viewer to display the GraphViz vizualization of the analysis
- /// result.
- ///
- /// Practical to call in the debugger.
- /// Includes the instructions in each BasicBlock.
- ///
- /// @param RI The analysis to display.
- void viewRegion(llvm::RegionInfo *RI);
+/// Open a viewer to display the GraphViz visualization of the analysis
+/// result.
+///
+/// Practical to call in the debugger.
+/// Includes the instructions in each BasicBlock.
+///
+/// @param RI The analysis to display.
+void viewRegion(llvm::RegionInfo *RI);
- /// Analyze the regions of a function and open its GraphViz
- /// visualization in a viewer.
- ///
- /// Useful to call in the debugger.
- /// Includes the instructions in each BasicBlock.
- /// The result of a new analysis may differ from the RegionInfo the pass
- /// manager currently holds.
- ///
- /// @param F Function to analyze.
- void viewRegion(const llvm::Function *F);
+/// Analyze the regions of a function and open its GraphViz
+/// visualization in a viewer.
+///
+/// Useful to call in the debugger.
+/// Includes the instructions in each BasicBlock.
+/// The result of a new analysis may differ from the RegionInfo the pass
+/// manager currently holds.
+///
+/// @param F Function to analyze.
+void viewRegion(const llvm::Function *F);
- /// Open a viewer to display the GraphViz vizualization of the analysis
- /// result.
- ///
- /// Useful to call in the debugger.
- /// Shows only the BasicBlock names without their instructions.
- ///
- /// @param RI The analysis to display.
- void viewRegionOnly(llvm::RegionInfo *RI);
+/// Open a viewer to display the GraphViz visualization of the analysis
+/// result.
+///
+/// Useful to call in the debugger.
+/// Shows only the BasicBlock names without their instructions.
+///
+/// @param RI The analysis to display.
+void viewRegionOnly(llvm::RegionInfo *RI);
- /// Analyze the regions of a function and open its GraphViz
- /// visualization in a viewer.
- ///
- /// Useful to call in the debugger.
- /// Shows only the BasicBlock names without their instructions.
- /// The result of a new analysis may differ from the RegionInfo the pass
- /// manager currently holds.
- ///
- /// @param F Function to analyze.
- void viewRegionOnly(const llvm::Function *F);
-#endif
-} // End llvm namespace
+/// Analyze the regions of a function and open its GraphViz
+/// visualization in a viewer.
+///
+/// Useful to call in the debugger.
+/// Shows only the BasicBlock names without their instructions.
+/// The result of a new analysis may differ from the RegionInfo the pass
+/// manager currently holds.
+///
+/// @param F Function to analyze.
+void viewRegionOnly(const llvm::Function *F);
+#endif // NDEBUG
-#endif
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_REGIONPRINTER_H
diff --git a/llvm/include/llvm/CodeGen/Analysis.h b/llvm/include/llvm/CodeGen/Analysis.h
index 98b5257..2f1364d 100644
--- a/llvm/include/llvm/CodeGen/Analysis.h
+++ b/llvm/include/llvm/CodeGen/Analysis.h
@@ -71,7 +71,7 @@ void ComputeValueTypes(const DataLayout &DL, Type *Ty,
///
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty,
SmallVectorImpl<EVT> &ValueVTs,
- SmallVectorImpl<EVT> *MemVTs,
+ SmallVectorImpl<EVT> *MemVTs = nullptr,
SmallVectorImpl<TypeSize> *Offsets = nullptr,
TypeSize StartingOffset = TypeSize::getZero());
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty,
@@ -80,20 +80,6 @@ void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty,
SmallVectorImpl<uint64_t> *FixedOffsets,
uint64_t StartingOffset);
-/// Variant of ComputeValueVTs that don't produce memory VTs.
-inline void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
- Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
- SmallVectorImpl<TypeSize> *Offsets = nullptr,
- TypeSize StartingOffset = TypeSize::getZero()) {
- ComputeValueVTs(TLI, DL, Ty, ValueVTs, nullptr, Offsets, StartingOffset);
-}
-inline void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
- Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
- SmallVectorImpl<uint64_t> *FixedOffsets,
- uint64_t StartingOffset) {
- ComputeValueVTs(TLI, DL, Ty, ValueVTs, nullptr, FixedOffsets, StartingOffset);
-}
-
/// computeValueLLTs - Given an LLVM IR type, compute a sequence of
/// LLTs that represent all the individual underlying
/// non-aggregate types that comprise it.
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index ff3dd0d..af60c04 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1537,6 +1537,9 @@ enum NodeType {
#define BEGIN_REGISTER_VP_SDNODE(VPSDID, ...) VPSDID,
#include "llvm/IR/VPIntrinsics.def"
+ // Issue a no-op relocation against a given symbol at the current location.
+ RELOC_NONE,
+
// The `llvm.experimental.convergence.*` intrinsics.
CONVERGENCECTRL_ANCHOR,
CONVERGENCECTRL_ENTRY,
diff --git a/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h b/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h
index e8eceee..e88079e 100644
--- a/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h
+++ b/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h
@@ -6,15 +6,18 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_CODEGEN_LIBCALLLOWERINGINFO_H
+#define LLVM_CODEGEN_LIBCALLLOWERINGINFO_H
+
#include "llvm/IR/RuntimeLibcalls.h"
namespace llvm {
class LibcallLoweringInfo {
private:
- LLVM_ABI const RTLIB::RuntimeLibcallsInfo &RTLCI;
+ const RTLIB::RuntimeLibcallsInfo &RTLCI;
 /// Stores the implementation choice for each libcall.
- LLVM_ABI RTLIB::LibcallImpl LibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1] = {
+ RTLIB::LibcallImpl LibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1] = {
RTLIB::Unsupported};
public:
@@ -64,3 +67,5 @@ public:
};
} // end namespace llvm
+
+#endif // LLVM_CODEGEN_LIBCALLLOWERINGINFO_H
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
index d7921c3..27acc83 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
@@ -474,6 +474,7 @@ private:
void Select_WRITE_REGISTER(SDNode *Op);
void Select_UNDEF(SDNode *N);
void Select_FAKE_USE(SDNode *N);
+ void Select_RELOC_NONE(SDNode *N);
void CannotYetSelect(SDNode *N);
void Select_FREEZE(SDNode *N);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 6a079f6..520130c 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1913,6 +1913,9 @@ def int_threadlocal_address : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [LLVMMatch
def int_stepvector : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[], [IntrNoMem]>;
+def int_reloc_none : DefaultAttrsIntrinsic<[], [llvm_metadata_ty],
+ [IntrNoMem, IntrHasSideEffects]>;
+
//===---------------- Vector Predication Intrinsics --------------===//
// Memory Intrinsics
def int_vp_store : DefaultAttrsIntrinsic<[],
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index e3ec7e1..bb3d7ff 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -3069,12 +3069,26 @@ m_c_MaxOrMin(const LHS &L, const RHS &R) {
m_CombineOr(m_c_UMax(L, R), m_c_UMin(L, R)));
}
+template <Intrinsic::ID IntrID, typename LHS, typename RHS>
+struct CommutativeBinaryIntrinsic_match {
+ LHS L;
+ RHS R;
+
+ CommutativeBinaryIntrinsic_match(const LHS &L, const RHS &R) : L(L), R(R) {}
+
+ template <typename OpTy> bool match(OpTy *V) const {
+ const auto *II = dyn_cast<IntrinsicInst>(V);
+ if (!II || II->getIntrinsicID() != IntrID)
+ return false;
+ return (L.match(II->getArgOperand(0)) && R.match(II->getArgOperand(1))) ||
+ (L.match(II->getArgOperand(1)) && R.match(II->getArgOperand(0)));
+ }
+};
+
template <Intrinsic::ID IntrID, typename T0, typename T1>
-inline match_combine_or<typename m_Intrinsic_Ty<T0, T1>::Ty,
- typename m_Intrinsic_Ty<T1, T0>::Ty>
+inline CommutativeBinaryIntrinsic_match<IntrID, T0, T1>
m_c_Intrinsic(const T0 &Op0, const T1 &Op1) {
- return m_CombineOr(m_Intrinsic<IntrID>(Op0, Op1),
- m_Intrinsic<IntrID>(Op1, Op0));
+ return CommutativeBinaryIntrinsic_match<IntrID, T0, T1>(Op0, Op1);
}
/// Matches FAdd with LHS and RHS in either order.
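The replacement matcher checks both operand orders inside a single object rather than expanding into two full m_Intrinsic matchers through m_CombineOr. Call sites keep the same shape; a sketch with hypothetical names:

    using namespace llvm::PatternMatch;
    llvm::Value *X;
    // Matches both smax(X, 0) and smax(0, X).
    if (match(V, m_c_Intrinsic<llvm::Intrinsic::smax>(m_Value(X), m_ZeroInt())))
      useClampedValue(X); // hypothetical consumer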
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 85a9efe..7886478 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -1058,8 +1058,10 @@ struct NamedInstrProfRecord : InstrProfRecord {
StringRef Name;
uint64_t Hash;
- // We reserve this bit as the flag for context sensitive profile record.
- static const int CS_FLAG_IN_FUNC_HASH = 60;
+ // We reserve the highest 4 bits as flags.
+ static constexpr uint64_t FUNC_HASH_MASK = 0x0FFF'FFFF'FFFF'FFFF;
+ // The 60th bit is for context-sensitive profile records.
+ static constexpr unsigned CS_FLAG_IN_FUNC_HASH = 60;
NamedInstrProfRecord() = default;
NamedInstrProfRecord(StringRef Name, uint64_t Hash,
@@ -1174,7 +1176,9 @@ enum ProfVersion {
Version11 = 11,
// VTable profiling, decision record and bitmap are modified for mcdc.
Version12 = 12,
- // The current version is 12.
+ // In this version, the frontend PGO stable hash algorithm defaults to V4.
+ Version13 = 13,
+ // The current version is 13.
CurrentVersion = INSTR_PROF_INDEX_VERSION
};
const uint64_t Version = ProfVersion::CurrentVersion;
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index 0496f24..46d6bb5 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -722,7 +722,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
/* Raw profile format version (start from 1). */
#define INSTR_PROF_RAW_VERSION 10
/* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 12
+#define INSTR_PROF_INDEX_VERSION 13
/* Coverage mapping format version (start from 0). */
#define INSTR_PROF_COVMAP_VERSION 6
diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h
index a6435a2..af283e2 100644
--- a/llvm/include/llvm/Support/Casting.h
+++ b/llvm/include/llvm/Support/Casting.h
@@ -878,18 +878,18 @@ inline constexpr detail::IsaAndPresentCheckPredicate<Types...>
IsaAndPresentPred{};
/// Function objects corresponding to the Cast types defined above.
-template <typename From>
-inline constexpr detail::StaticCastFunc<From> StaticCastTo{};
+template <typename To>
+inline constexpr detail::StaticCastFunc<To> StaticCastTo{};
-template <typename From> inline constexpr detail::CastFunc<From> CastTo{};
+template <typename To> inline constexpr detail::CastFunc<To> CastTo{};
-template <typename From>
-inline constexpr detail::CastIfPresentFunc<From> CastIfPresentTo{};
+template <typename To>
+inline constexpr detail::CastIfPresentFunc<To> CastIfPresentTo{};
-template <typename From>
-inline constexpr detail::DynCastIfPresentFunc<From> DynCastIfPresentTo{};
+template <typename To>
+inline constexpr detail::DynCastIfPresentFunc<To> DynCastIfPresentTo{};
-template <typename From> inline constexpr detail::DynCastFunc<From> DynCastTo{};
+template <typename To> inline constexpr detail::DynCastFunc<To> DynCastTo{};
} // end namespace llvm
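The rename is cosmetic but meaningful: the template parameter names the destination type of the cast, so To is the accurate name. A sketch of typical use, assuming Insts is a range of non-null Instruction pointers:

    // DynCastTo<CallInst> behaves like dyn_cast<CallInst> as a function
    // object, which makes it convenient with range adaptors.
    auto MaybeCalls = llvm::map_range(Insts, llvm::DynCastTo<llvm::CallInst>);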
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index e5531456..fb20da3 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -233,6 +233,9 @@ HANDLE_TARGET_OPCODE(MEMBARRIER)
// using.
HANDLE_TARGET_OPCODE(JUMP_TABLE_DEBUG_INFO)
+// Issue a no-op relocation against a given symbol at the current location.
+HANDLE_TARGET_OPCODE(RELOC_NONE)
+
HANDLE_TARGET_OPCODE(CONVERGENCECTRL_ENTRY)
HANDLE_TARGET_OPCODE(CONVERGENCECTRL_ANCHOR)
HANDLE_TARGET_OPCODE(CONVERGENCECTRL_LOOP)
diff --git a/llvm/include/llvm/Support/thread.h b/llvm/include/llvm/Support/thread.h
index ecde62d..51873e7 100644
--- a/llvm/include/llvm/Support/thread.h
+++ b/llvm/include/llvm/Support/thread.h
@@ -34,7 +34,7 @@ typedef PVOID HANDLE;
namespace llvm {
-#if LLVM_ON_UNIX || _WIN32
+#if defined(LLVM_ON_UNIX) || defined(_WIN32)
/// LLVM thread following std::thread interface with added constructor to
/// specify stack size.
@@ -49,7 +49,7 @@ class thread {
}
public:
-#if LLVM_ON_UNIX
+#ifdef LLVM_ON_UNIX
using native_handle_type = pthread_t;
using id = pthread_t;
using start_routine_type = void *(*)(void *);
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index 1317517..db99885 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -1554,6 +1554,11 @@ def JUMP_TABLE_DEBUG_INFO : StandardPseudoInstruction {
let Size = 0;
let isMeta = true;
}
+def RELOC_NONE : StandardPseudoInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins unknown:$symbol);
+ let hasSideEffects = true;
+}
let hasSideEffects = false, isMeta = true, isConvergent = true in {
def CONVERGENCECTRL_ANCHOR : StandardPseudoInstruction {
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h
index aad9859..9dfa50c1 100644
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -161,6 +161,9 @@ enum ArchFeatureKind : uint32_t {
// WGP mode is supported.
FEATURE_WGP = 1 << 9,
+
+ // Xnack is available by default
+ FEATURE_XNACK_ALWAYS = 1 << 10
};
enum FeatureError : uint32_t {
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 11d8294..e45d1f7 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -1587,6 +1587,15 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B,
return nullptr;
}
+/// Returns \p A * \p B if it is guaranteed not to signed wrap. Otherwise
+/// returns nullptr. \p A and \p B must have the same integer type.
+static const SCEV *mulSCEVNoSignedOverflow(const SCEV *A, const SCEV *B,
+ ScalarEvolution &SE) {
+ if (SE.willNotOverflow(Instruction::Mul, /*Signed=*/true, A, B))
+ return SE.getMulExpr(A, B);
+ return nullptr;
+}
+
/// Returns the absolute value of \p A. In the context of dependence analysis,
/// we need an absolute value in a mathematical sense. If \p A is the signed
/// minimum value, we cannot represent it unless extending the original type.
@@ -1686,7 +1695,11 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
assert(0 < Level && Level <= CommonLevels && "level out of range");
Level--;
- const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst);
+ const SCEV *Delta = minusSCEVNoSignedOverflow(SrcConst, DstConst, *SE);
+ if (!Delta) {
+ Result.Consistent = false;
+ return false;
+ }
LLVM_DEBUG(dbgs() << "\t Delta = " << *Delta);
LLVM_DEBUG(dbgs() << ", " << *Delta->getType() << "\n");
@@ -1702,7 +1715,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
const SCEV *AbsCoeff = absSCEVNoSignedOverflow(Coeff, *SE);
if (!AbsDelta || !AbsCoeff)
return false;
- const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff);
+ const SCEV *Product = mulSCEVNoSignedOverflow(UpperBound, AbsCoeff, *SE);
+ if (!Product)
+ return false;
return isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product);
}();
if (IsDeltaLarge) {
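Both helpers follow the same defensive pattern: build the SCEV only when ScalarEvolution proves the signed operation cannot wrap, and otherwise return nullptr so the caller gives up (or marks the result inconsistent) rather than reasoning on a value that may have wrapped. A standalone analogue of the idea, deliberately not the LLVM API:

  #include <optional>

  // Returns a*b only when the signed multiply provably does not wrap;
  // std::nullopt plays the role of the nullptr SCEV above.
  std::optional<long long> mulNoSignedWrap(long long a, long long b) {
    long long r;
    if (__builtin_mul_overflow(a, b, &r)) // GCC/Clang builtin
      return std::nullopt;
    return r;
  }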
diff --git a/llvm/lib/Analysis/RegionPrinter.cpp b/llvm/lib/Analysis/RegionPrinter.cpp
index a83af4e..33e073b 100644
--- a/llvm/lib/Analysis/RegionPrinter.cpp
+++ b/llvm/lib/Analysis/RegionPrinter.cpp
@@ -29,10 +29,9 @@ onlySimpleRegions("only-simple-regions",
cl::Hidden,
cl::init(false));
-namespace llvm {
-
-std::string DOTGraphTraits<RegionNode *>::getNodeLabel(RegionNode *Node,
- RegionNode *Graph) {
+std::string
+llvm::DOTGraphTraits<RegionNode *>::getNodeLabel(RegionNode *Node,
+ RegionNode *Graph) {
if (!Node->isSubRegion()) {
BasicBlock *BB = Node->getNodeAs<BasicBlock>();
@@ -46,7 +45,8 @@ std::string DOTGraphTraits<RegionNode *>::getNodeLabel(RegionNode *Node,
}
template <>
-struct DOTGraphTraits<RegionInfo *> : public DOTGraphTraits<RegionNode *> {
+struct llvm::DOTGraphTraits<RegionInfo *>
+ : public llvm::DOTGraphTraits<RegionNode *> {
DOTGraphTraits (bool isSimple = false)
: DOTGraphTraits<RegionNode*>(isSimple) {}
@@ -125,7 +125,6 @@ struct DOTGraphTraits<RegionInfo *> : public DOTGraphTraits<RegionNode *> {
printRegionCluster(*G->getTopLevelRegion(), GW, 4);
}
};
-} // end namespace llvm
namespace {
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 713277d..3aa245b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2087,6 +2087,17 @@ void AsmPrinter::emitFunctionBody() {
// This is only used to influence register allocation behavior, no
// actual initialization is needed.
break;
+ case TargetOpcode::RELOC_NONE: {
+ // Generate a temporary label for the current PC.
+ MCSymbol *Sym = OutContext.createTempSymbol("reloc_none");
+ OutStreamer->emitLabel(Sym);
+ const MCExpr *Dot = MCSymbolRefExpr::create(Sym, OutContext);
+ const MCExpr *Value = MCSymbolRefExpr::create(
+ OutContext.getOrCreateSymbol(MI.getOperand(0).getSymbolName()),
+ OutContext);
+ OutStreamer->emitRelocDirective(*Dot, "BFD_RELOC_NONE", Value, SMLoc());
+ break;
+ }
default:
emitInstruction(&MI);
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index b3c3125..7be7468 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -292,7 +292,8 @@ void CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
LLVMContext &Ctx = OrigArg.Ty->getContext();
SmallVector<EVT, 4> SplitVTs;
- ComputeValueVTs(*TLI, DL, OrigArg.Ty, SplitVTs, Offsets, 0);
+ ComputeValueVTs(*TLI, DL, OrigArg.Ty, SplitVTs, /*MemVTs=*/nullptr, Offsets,
+ 0);
if (SplitVTs.size() == 0)
return;
@@ -996,7 +997,7 @@ void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy,
SmallVector<EVT, 4> SplitVTs;
SmallVector<uint64_t, 4> Offsets;
- ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, &Offsets, 0);
+ ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, /*MemVTs=*/nullptr, &Offsets, 0);
assert(VRegs.size() == SplitVTs.size());
@@ -1028,7 +1029,7 @@ void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy,
SmallVector<EVT, 4> SplitVTs;
SmallVector<uint64_t, 4> Offsets;
- ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, &Offsets, 0);
+ ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, /*MemVTs=*/nullptr, &Offsets, 0);
assert(VRegs.size() == SplitVTs.size());
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index be1b51f..4f6a19f 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2686,6 +2686,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
case Intrinsic::experimental_convergence_entry:
case Intrinsic::experimental_convergence_loop:
return translateConvergenceControlIntrinsic(CI, ID, MIRBuilder);
+ case Intrinsic::reloc_none: {
+ Metadata *MD = cast<MetadataAsValue>(CI.getArgOperand(0))->getMetadata();
+ StringRef SymbolName = cast<MDString>(MD)->getString();
+ MIRBuilder.buildInstr(TargetOpcode::RELOC_NONE)
+ .addExternalSymbol(SymbolName.data());
+ return true;
+ }
}
return false;
}
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 4795d81..434a579 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -1161,6 +1161,8 @@ bool MIParser::parse(MachineInstr *&MI) {
MemOperands.push_back(MemOp);
if (Token.isNewlineOrEOF())
break;
+ if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace))
+ break;
if (Token.isNot(MIToken::comma))
return error("expected ',' before the next machine memory operand");
lex();
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index 8ad9245..37e5c51 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1547,10 +1547,14 @@ bool MachineInstr::mayAlias(BatchAAResults *AA, const MachineInstr &Other,
// Check each pair of memory operands from both instructions, which can't
// alias only if all pairs won't alias.
- for (auto *MMOa : memoperands())
- for (auto *MMOb : Other.memoperands())
+ for (auto *MMOa : memoperands()) {
+ for (auto *MMOb : Other.memoperands()) {
+ if (!MMOa->isStore() && !MMOb->isStore())
+ continue;
if (MemOperandsHaveAlias(MFI, AA, UseTBAA, MMOa, MMOb))
return true;
+ }
+ }
return false;
}
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index f4c1a8b..fa654f2 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -143,6 +143,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
SmallSet<Register, 8> KilledUseSet;
SmallSet<Register, 8> UndefUseSet;
SmallVector<std::pair<Register, Register>> TiedOperands;
+ SmallVector<MachineInstr *> MemMIs;
for (auto MII = FirstMI; MII != LastMI; ++MII) {
// Debug instructions have no effects to track.
if (MII->isDebugInstr())
@@ -206,6 +207,9 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
MIB.setMIFlag(MachineInstr::FrameSetup);
if (MII->getFlag(MachineInstr::FrameDestroy))
MIB.setMIFlag(MachineInstr::FrameDestroy);
+
+ if (MII->mayLoadOrStore())
+ MemMIs.push_back(&*MII);
}
for (Register Reg : LocalDefs) {
@@ -231,6 +235,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
assert(UseIdx < ExternUses.size());
MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx);
}
+
+ MIB->cloneMergedMemRefs(MF, MemMIs);
}
/// finalizeBundle - Same functionality as the previous finalizeBundle except
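With this change the BUNDLE header instruction carries the union of its members' memory operands instead of none, so alias queries against the header (see the MachineInstr::mayAlias and SIInstrInfo hunks in this patch) no longer have to assume unknown memory. The intent, reduced to a sketch with assumed variable names:

  // Collect every memory-touching member while walking the bundle ...
  SmallVector<MachineInstr *> MemMIs;
  for (MachineInstr &Member : Bundle)
    if (Member.mayLoadOrStore())
      MemMIs.push_back(&Member);
  // ... then attach their merged memrefs to the BUNDLE instruction itself.
  BundleMI->cloneMergedMemRefs(MF, MemMIs);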
diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp
index e9ffa85..6b747f3 100644
--- a/llvm/lib/CodeGen/SafeStack.cpp
+++ b/llvm/lib/CodeGen/SafeStack.cpp
@@ -196,8 +196,6 @@ public:
bool run();
};
-constexpr Align SafeStack::StackAlignment;
-
uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType());
if (AI->isArrayAllocation()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 9961c98..2f598b2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4758,7 +4758,7 @@ void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) {
SmallVector<uint64_t, 4> Offsets;
const Value *SrcV = I.getOperand(0);
ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
- SrcV->getType(), ValueVTs, &Offsets, 0);
+ SrcV->getType(), ValueVTs, /*MemVTs=*/nullptr, &Offsets, 0);
assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
"expect a single EVT for swifterror");
@@ -4794,7 +4794,7 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
SmallVector<EVT, 4> ValueVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Ty,
- ValueVTs, &Offsets, 0);
+ ValueVTs, /*MemVTs=*/nullptr, &Offsets, 0);
assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
"expect a single EVT for swifterror");
@@ -7811,6 +7811,17 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
}
+ case Intrinsic::reloc_none: {
+ Metadata *MD = cast<MetadataAsValue>(I.getArgOperand(0))->getMetadata();
+ StringRef SymbolName = cast<MDString>(MD)->getString();
+ SDValue Ops[2] = {
+ getRoot(),
+ DAG.getTargetExternalSymbol(
+ SymbolName.data(), TLI.getProgramPointerTy(DAG.getDataLayout()))};
+ DAG.setRoot(DAG.getNode(ISD::RELOC_NONE, sdl, MVT::Other, Ops));
+ return;
+ }
+
case Intrinsic::eh_exceptionpointer:
case Intrinsic::eh_exceptioncode: {
// Get the exception pointer vreg, copy from it, and resize it to fit.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 77377d3..d3e1628 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -472,6 +472,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::LIFETIME_END: return "lifetime.end";
case ISD::FAKE_USE:
return "fake_use";
+ case ISD::RELOC_NONE:
+ return "reloc_none";
case ISD::PSEUDO_PROBE:
return "pseudoprobe";
case ISD::GC_TRANSITION_START: return "gc_transition.start";
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 6c11c5b..8bc5d2f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2550,6 +2550,11 @@ void SelectionDAGISel::Select_FAKE_USE(SDNode *N) {
N->getOperand(1), N->getOperand(0));
}
+void SelectionDAGISel::Select_RELOC_NONE(SDNode *N) {
+ CurDAG->SelectNodeTo(N, TargetOpcode::RELOC_NONE, N->getValueType(0),
+ N->getOperand(1), N->getOperand(0));
+}
+
void SelectionDAGISel::Select_FREEZE(SDNode *N) {
// TODO: We don't have FREEZE pseudo-instruction in MachineInstr-level now.
// If FREEZE instruction is added later, the code below must be changed as
@@ -3325,6 +3330,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
case ISD::FAKE_USE:
Select_FAKE_USE(NodeToMatch);
return;
+ case ISD::RELOC_NONE:
+ Select_RELOC_NONE(NodeToMatch);
+ return;
case ISD::FREEZE:
Select_FREEZE(NodeToMatch);
return;
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 414e414..b99e1c7 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1665,6 +1665,17 @@ void TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
// by SubRegB is compatible with RegA with no subregister. So regardless of
// whether the dest oper writes a subreg, the source oper should not.
MO.setSubReg(0);
+
+ // Update uses of RegB to uses of RegA inside the bundle.
+ if (MI->isBundle()) {
+ for (MachineOperand &MO : mi_bundle_ops(*MI)) {
+ if (MO.isReg() && MO.getReg() == RegB) {
+ assert(MO.getSubReg() == 0 && SubRegB == 0 &&
+ "tied subregister uses in bundled instructions not supported");
+ MO.setReg(RegA);
+ }
+ }
+ }
}
if (AllUsesCopied) {
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index fafc325..a98e925 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -962,16 +962,29 @@ DIType *DIDerivedType::getClassType() const {
assert(getTag() == dwarf::DW_TAG_ptr_to_member_type);
return cast_or_null<DIType>(getExtraData());
}
+
+// Helper function to extract a ConstantAsMetadata from ExtraData,
+// unwrapping a single-element MDTuple wrapper if present.
+static ConstantAsMetadata *extractConstantMetadata(Metadata *ExtraData) {
+ Metadata *ED = ExtraData;
+ if (auto *Tuple = dyn_cast_or_null<MDTuple>(ED)) {
+ if (Tuple->getNumOperands() != 1)
+ return nullptr;
+ ED = Tuple->getOperand(0);
+ }
+ return cast_or_null<ConstantAsMetadata>(ED);
+}
+
uint32_t DIDerivedType::getVBPtrOffset() const {
assert(getTag() == dwarf::DW_TAG_inheritance);
- if (auto *CM = cast_or_null<ConstantAsMetadata>(getExtraData()))
+ if (auto *CM = extractConstantMetadata(getExtraData()))
if (auto *CI = dyn_cast_or_null<ConstantInt>(CM->getValue()))
return static_cast<uint32_t>(CI->getZExtValue());
return 0;
}
Constant *DIDerivedType::getStorageOffsetInBits() const {
assert(getTag() == dwarf::DW_TAG_member && isBitField());
- if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData()))
+ if (auto *C = extractConstantMetadata(getExtraData()))
return C->getValue();
return nullptr;
}
@@ -980,13 +993,13 @@ Constant *DIDerivedType::getConstant() const {
assert((getTag() == dwarf::DW_TAG_member ||
getTag() == dwarf::DW_TAG_variable) &&
isStaticMember());
- if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData()))
+ if (auto *C = extractConstantMetadata(getExtraData()))
return C->getValue();
return nullptr;
}
Constant *DIDerivedType::getDiscriminantValue() const {
assert(getTag() == dwarf::DW_TAG_member && !isStaticMember());
- if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData()))
+ if (auto *C = extractConstantMetadata(getExtraData()))
return C->getValue();
return nullptr;
}
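The helper makes the extraData accessors tolerant of both encodings that producers emit. An illustrative use (DT and Offset are assumed locals):

  // Both encodings now yield the same constant:
  //   extraData: i32 8      -> ConstantAsMetadata directly
  //   extraData: !{i32 8}   -> one-element MDTuple, unwrapped first
  if (auto *CM = extractConstantMetadata(DT->getExtraData()))
    if (auto *CI = dyn_cast<ConstantInt>(CM->getValue()))
      Offset = CI->getZExtValue();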
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 24f90bf..f1e473a 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6013,6 +6013,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
Check(cast<ConstantInt>(Call.getArgOperand(3))->getZExtValue() < 2,
"cache type argument to llvm.prefetch must be 0-1", Call);
break;
+ case Intrinsic::reloc_none: {
+ Check(isa<MDString>(
+ cast<MetadataAsValue>(Call.getArgOperand(0))->getMetadata()),
+ "llvm.reloc.none argument must be a metadata string", &Call);
+ break;
+ }
case Intrinsic::stackprotector:
Check(isa<AllocaInst>(Call.getArgOperand(1)->stripPointerCasts()),
"llvm.stackprotector parameter #2 must resolve to an alloca.", Call);
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 0208735..5498787 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -1690,7 +1690,7 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
IndexedInstrProf::ProfVersion::CurrentVersion)
return make_error<InstrProfError>(instrprof_error::unsupported_version);
- static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version12,
+ static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version13,
"Please update the reader as needed when a new field is added "
"or when indexed profile version gets bumped.");
@@ -1723,10 +1723,11 @@ size_t Header::size() const {
// of the header, and byte offset of existing fields shouldn't change when
// indexed profile version gets incremented.
static_assert(
- IndexedInstrProf::ProfVersion::CurrentVersion == Version12,
+ IndexedInstrProf::ProfVersion::CurrentVersion == Version13,
"Please update the size computation below if a new field has "
"been added to the header; for a version bump without new "
"fields, add a case statement to fall through to the latest version.");
+ case 13ull:
case 12ull:
return 72;
case 11ull:
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index a347351..0f15ca8 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -542,7 +542,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
// The WritePrevVersion handling will either need to be removed or updated
// if the version is advanced beyond 12.
static_assert(IndexedInstrProf::ProfVersion::CurrentVersion ==
- IndexedInstrProf::ProfVersion::Version12);
+ IndexedInstrProf::ProfVersion::Version13);
if (static_cast<bool>(ProfileKind & InstrProfKind::IRInstrumentation))
Header.Version |= VARIANT_MASK_IR_PROF;
if (static_cast<bool>(ProfileKind & InstrProfKind::ContextSensitive))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1b559a6..f5081a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1248,7 +1248,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
SmallVector<EVT, 16> ValueVTs;
SmallVector<uint64_t, 16> Offsets;
- ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
+ ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
+ &Offsets, ArgOffset);
for (unsigned Value = 0, NumValues = ValueVTs.size();
Value != NumValues; ++Value) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9460145..6ce18ea 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3917,6 +3917,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
if (isLDSDMA(MIa) || isLDSDMA(MIb))
return false;
+ if (MIa.isBundle() || MIb.isBundle())
+ return false;
+
// TODO: Should we check the address space from the MachineMemOperand? That
// would allow us to distinguish objects we know don't alias based on the
// underlying address space, even if it was lowered to a different one,
diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
index 8c7bc2f..81303fa 100644
--- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
+++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -97,7 +97,6 @@
#define DEBUG_TYPE "bpf-abstract-member-access"
namespace llvm {
-constexpr StringRef BPFCoreSharedInfo::AmaAttr;
uint32_t BPFCoreSharedInfo::SeqNum;
Instruction *BPFCoreSharedInfo::insertPassThrough(Module *M, BasicBlock *BB,
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
index 6e5520c..3c61216 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -803,26 +803,6 @@ SDValue BPFTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
return getAddr(N, DAG);
}
-const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
- switch ((BPFISD::NodeType)Opcode) {
- case BPFISD::FIRST_NUMBER:
- break;
- case BPFISD::RET_GLUE:
- return "BPFISD::RET_GLUE";
- case BPFISD::CALL:
- return "BPFISD::CALL";
- case BPFISD::SELECT_CC:
- return "BPFISD::SELECT_CC";
- case BPFISD::BR_CC:
- return "BPFISD::BR_CC";
- case BPFISD::Wrapper:
- return "BPFISD::Wrapper";
- case BPFISD::MEMCPY:
- return "BPFISD::MEMCPY";
- }
- return nullptr;
-}
-
static SDValue getTargetNode(ConstantPoolSDNode *N, const SDLoc &DL, EVT Ty,
SelectionDAG &DAG, unsigned Flags) {
return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h
index 5243d49..3d6e7c7 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -20,17 +20,6 @@
namespace llvm {
class BPFSubtarget;
-namespace BPFISD {
-enum NodeType : unsigned {
- FIRST_NUMBER = ISD::BUILTIN_OP_END,
- RET_GLUE,
- CALL,
- SELECT_CC,
- BR_CC,
- Wrapper,
- MEMCPY
-};
-}
class BPFTargetLowering : public TargetLowering {
public:
@@ -39,9 +28,6 @@ public:
// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
- // This method returns the name of a target specific DAG node.
- const char *getTargetNodeName(unsigned Opcode) const override;
-
// This method decides whether folding a constant offset
// with the given GlobalAddress is legal.
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index 51c32b2..bdacf9c 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -41,14 +41,12 @@ def BPFcallseq_start: SDNode<"ISD::CALLSEQ_START", SDT_BPFCallSeqStart,
[SDNPHasChain, SDNPOutGlue]>;
def BPFcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_BPFCallSeqEnd,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue]>;
+def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC, [SDNPHasChain]>;
def BPFselectcc : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC>;
def BPFWrapper : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>;
def BPFmemcpy : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY,
- [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
- SDNPMayStore, SDNPMayLoad]>;
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def BPFIsLittleEndian : Predicate<"Subtarget->isLittleEndian()">;
def BPFIsBigEndian : Predicate<"!Subtarget->isLittleEndian()">;
def BPFHasALU32 : Predicate<"Subtarget->getHasAlu32()">;
diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
index d3b0c02..6a11ea6 100644
--- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
+++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
@@ -27,10 +27,6 @@
#define DEBUG_TYPE "bpf-preserve-di-type"
-namespace llvm {
-constexpr StringRef BPFCoreSharedInfo::TypeIdAttr;
-} // namespace llvm
-
using namespace llvm;
namespace {
diff --git a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp
index 3e29e6c..0e6d35d 100644
--- a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp
@@ -10,12 +10,20 @@
//
//===----------------------------------------------------------------------===//
+#include "BPFSelectionDAGInfo.h"
#include "BPFTargetMachine.h"
#include "llvm/CodeGen/SelectionDAG.h"
+
+#define GET_SDNODE_DESC
+#include "BPFGenSDNodeInfo.inc"
+
using namespace llvm;
#define DEBUG_TYPE "bpf-selectiondag-info"
+BPFSelectionDAGInfo::BPFSelectionDAGInfo()
+ : SelectionDAGGenTargetInfo(BPFGenSDNodeInfo) {}
+
SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
@@ -31,11 +39,7 @@ SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy(
if (StoresNumEstimate > getCommonMaxStoresPerMemFunc())
return SDValue();
- SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
-
- Dst = DAG.getNode(BPFISD::MEMCPY, dl, VTs, Chain, Dst, Src,
- DAG.getConstant(CopyLen, dl, MVT::i64),
- DAG.getConstant(Alignment.value(), dl, MVT::i64));
-
- return Dst.getValue(0);
+ return DAG.getNode(BPFISD::MEMCPY, dl, MVT::Other, Chain, Dst, Src,
+ DAG.getConstant(CopyLen, dl, MVT::i64),
+ DAG.getConstant(Alignment.value(), dl, MVT::i64));
}
diff --git a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h
index 79f05e5..7345d2d 100644
--- a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h
+++ b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h
@@ -15,10 +15,15 @@
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#define GET_SDNODE_ENUM
+#include "BPFGenSDNodeInfo.inc"
+
namespace llvm {
-class BPFSelectionDAGInfo : public SelectionDAGTargetInfo {
+class BPFSelectionDAGInfo : public SelectionDAGGenTargetInfo {
public:
+ BPFSelectionDAGInfo();
+
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment,
@@ -27,9 +32,8 @@ public:
MachinePointerInfo SrcPtrInfo) const override;
unsigned getCommonMaxStoresPerMemFunc() const { return 128; }
-
};
-}
+} // namespace llvm
#endif
diff --git a/llvm/lib/Target/BPF/CMakeLists.txt b/llvm/lib/Target/BPF/CMakeLists.txt
index 3678f13..fa539a0 100644
--- a/llvm/lib/Target/BPF/CMakeLists.txt
+++ b/llvm/lib/Target/BPF/CMakeLists.txt
@@ -10,6 +10,7 @@ tablegen(LLVM BPFGenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM BPFGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM BPFGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM BPFGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM BPFGenSDNodeInfo.inc -gen-sd-node-info)
tablegen(LLVM BPFGenSubtargetInfo.inc -gen-subtarget)
tablegen(LLVM BPFGenGlobalISel.inc -gen-global-isel)
tablegen(LLVM BPFGenRegisterBank.inc -gen-register-bank)
diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index d507d71..9f1616f 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -304,40 +304,76 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
GEPOperator *GOp = cast<GEPOperator>(&GEPI);
Value *PtrOperand = GOp->getPointerOperand();
Type *NewGEPType = GOp->getSourceElementType();
- bool NeedsTransform = false;
// Unwrap GEP ConstantExprs to find the base operand and element type
- while (auto *CE = dyn_cast<ConstantExpr>(PtrOperand)) {
- if (auto *GEPCE = dyn_cast<GEPOperator>(CE)) {
- GOp = GEPCE;
- PtrOperand = GEPCE->getPointerOperand();
- NewGEPType = GEPCE->getSourceElementType();
- } else
- break;
+ while (auto *GEPCE = dyn_cast_or_null<GEPOperator>(
+ dyn_cast<ConstantExpr>(PtrOperand))) {
+ GOp = GEPCE;
+ PtrOperand = GEPCE->getPointerOperand();
+ NewGEPType = GEPCE->getSourceElementType();
}
+ Type *const OrigGEPType = NewGEPType;
+ Value *const OrigOperand = PtrOperand;
+
if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) {
NewGEPType = NewGlobal->getValueType();
PtrOperand = NewGlobal;
- NeedsTransform = true;
} else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrOperand)) {
Type *AllocatedType = Alloca->getAllocatedType();
if (isa<ArrayType>(AllocatedType) &&
- AllocatedType != GOp->getResultElementType()) {
+ AllocatedType != GOp->getResultElementType())
NewGEPType = AllocatedType;
- NeedsTransform = true;
+ } else
+ return false; // Only GEPs into an alloca or global variable are considered
+
+ // Defer changing i8 GEP types until dxil-flatten-arrays
+ if (OrigGEPType->isIntegerTy(8))
+ NewGEPType = OrigGEPType;
+
+  // If the original type is a "sub-type" of the new type, ensure the GEP
+  // zero-indexes the extra dimensions so that the offset calculation stays
+  // correct.
+ // Eg:
+ // i32, [4 x i32] and [8 x [4 x i32]] are sub-types of [8 x [4 x i32]], etc.
+ //
+ // So then:
+ // gep [4 x i32] %idx
+ // -> gep [8 x [4 x i32]], i32 0, i32 %idx
+ // gep i32 %idx
+ // -> gep [8 x [4 x i32]], i32 0, i32 0, i32 %idx
+ uint32_t MissingDims = 0;
+ Type *SubType = NewGEPType;
+
+ // The new type will be in its array version; so match accordingly.
+ Type *const GEPArrType = equivalentArrayTypeFromVector(OrigGEPType);
+
+ while (SubType != GEPArrType) {
+ MissingDims++;
+
+ ArrayType *ArrType = dyn_cast<ArrayType>(SubType);
+ if (!ArrType) {
+ assert(SubType == GEPArrType &&
+ "GEP uses an DXIL invalid sub-type of alloca/global variable");
+ break;
}
+
+ SubType = ArrType->getElementType();
}
+ bool NeedsTransform = OrigOperand != PtrOperand ||
+ OrigGEPType != NewGEPType || MissingDims != 0;
+
if (!NeedsTransform)
return false;
- // Keep scalar GEPs scalar; dxil-flatten-arrays will do flattening later
- if (!isa<ArrayType>(GOp->getSourceElementType()))
- NewGEPType = GOp->getSourceElementType();
-
IRBuilder<> Builder(&GEPI);
- SmallVector<Value *, MaxVecSize> Indices(GOp->indices());
+ SmallVector<Value *, MaxVecSize> Indices;
+
+ for (uint32_t I = 0; I < MissingDims; I++)
+ Indices.push_back(Builder.getInt32(0));
+ llvm::append_range(Indices, GOp->indices());
+
Value *NewGEP = Builder.CreateGEP(NewGEPType, PtrOperand, Indices,
GOp->getName(), GOp->getNoWrapFlags());
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index ebb7c26..e0d2dbd 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -197,6 +197,7 @@ static Value *expand16BitIsNormal(CallInst *Orig) {
static bool isIntrinsicExpansion(Function &F) {
switch (F.getIntrinsicID()) {
+ case Intrinsic::assume:
case Intrinsic::abs:
case Intrinsic::atan2:
case Intrinsic::exp:
@@ -988,6 +989,9 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) {
case Intrinsic::abs:
Result = expandAbs(Orig);
break;
+ case Intrinsic::assume:
+ Orig->eraseFromParent();
+ return true;
case Intrinsic::atan2:
Result = expandAtan2Intrinsic(Orig);
break;
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 8720460..e46a393 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -904,8 +904,6 @@ public:
case Intrinsic::dx_resource_casthandle:
// NOTE: llvm.dbg.value is supported as is in DXIL.
case Intrinsic::dbg_value:
- // NOTE: llvm.assume is supported as is in DXIL.
- case Intrinsic::assume:
case Intrinsic::not_intrinsic:
if (F.use_empty())
F.eraseFromParent();
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 47726d6..55bafde 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -4753,6 +4753,19 @@ bool HexagonInstrInfo::getBundleNoShuf(const MachineInstr &MIB) const {
return (Operand.isImm() && (Operand.getImm() & memShufDisabledMask) != 0);
}
+bool HexagonInstrInfo::isQFPMul(const MachineInstr *MI) const {
+ return (MI->getOpcode() == Hexagon::V6_vmpy_qf16_hf ||
+ MI->getOpcode() == Hexagon::V6_vmpy_qf16_mix_hf ||
+ MI->getOpcode() == Hexagon::V6_vmpy_qf32_hf ||
+ MI->getOpcode() == Hexagon::V6_vmpy_qf32_mix_hf ||
+ MI->getOpcode() == Hexagon::V6_vmpy_qf32_sf ||
+ MI->getOpcode() == Hexagon::V6_vmpy_qf16_mix_hf ||
+ MI->getOpcode() == Hexagon::V6_vmpy_qf16 ||
+ MI->getOpcode() == Hexagon::V6_vmpy_qf32_mix_hf ||
+ MI->getOpcode() == Hexagon::V6_vmpy_qf32_qf16 ||
+ MI->getOpcode() == Hexagon::V6_vmpy_qf32);
+}
+
// Addressing mode relations.
short HexagonInstrInfo::changeAddrMode_abs_io(short Opc) const {
return Opc >= 0 ? Hexagon::changeAddrMode_abs_io(Opc) : Opc;
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index c17e527..48adf82 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -532,6 +532,7 @@ public:
}
MCInst getNop() const override;
+  bool isQFPMul(const MachineInstr *MI) const;
};
/// \brief Create RegSubRegPair from a register MachineOperand
diff --git a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
index f29a739..8801f69 100644
--- a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
@@ -58,7 +58,7 @@
// are PHI inst.
//
//===----------------------------------------------------------------------===//
-#include <unordered_set>
+
#define HEXAGON_QFP_OPTIMIZER "QFP optimizer pass"
#include "Hexagon.h"
@@ -86,6 +86,9 @@ using namespace llvm;
cl::opt<bool>
DisableQFOptimizer("disable-qfp-opt", cl::init(false),
cl::desc("Disable optimization of Qfloat operations."));
+cl::opt<bool> DisableQFOptForMul(
+ "disable-qfp-opt-mul", cl::init(true),
+ cl::desc("Disable optimization of Qfloat operations for multiply."));
namespace {
const std::map<unsigned short, unsigned short> QFPInstMap{
@@ -101,11 +104,21 @@ const std::map<unsigned short, unsigned short> QFPInstMap{
{Hexagon::V6_vmpy_qf16_mix_hf, Hexagon::V6_vmpy_qf16},
{Hexagon::V6_vmpy_qf32_hf, Hexagon::V6_vmpy_qf32_mix_hf},
{Hexagon::V6_vmpy_qf32_mix_hf, Hexagon::V6_vmpy_qf32_qf16},
- {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}};
+ {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32},
+ {Hexagon::V6_vilog2_sf, Hexagon::V6_vilog2_qf32},
+ {Hexagon::V6_vilog2_hf, Hexagon::V6_vilog2_qf16},
+ {Hexagon::V6_vabs_qf32_sf, Hexagon::V6_vabs_qf32_qf32},
+ {Hexagon::V6_vabs_qf16_hf, Hexagon::V6_vabs_qf16_qf16},
+ {Hexagon::V6_vneg_qf32_sf, Hexagon::V6_vneg_qf32_qf32},
+ {Hexagon::V6_vneg_qf16_hf, Hexagon::V6_vneg_qf16_qf16}};
} // namespace
-namespace {
+namespace llvm {
+FunctionPass *createHexagonQFPOptimizer();
+void initializeHexagonQFPOptimizerPass(PassRegistry &);
+} // namespace llvm
+namespace {
struct HexagonQFPOptimizer : public MachineFunctionPass {
public:
static char ID;
@@ -116,6 +129,10 @@ public:
bool optimizeQfp(MachineInstr *MI, MachineBasicBlock *MBB);
+ bool optimizeQfpTwoOp(MachineInstr *MI, MachineBasicBlock *MBB);
+
+ bool optimizeQfpOneOp(MachineInstr *MI, MachineBasicBlock *MBB);
+
StringRef getPassName() const override { return HEXAGON_QFP_OPTIMIZER; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -142,19 +159,69 @@ FunctionPass *llvm::createHexagonQFPOptimizer() {
bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
MachineBasicBlock *MBB) {
- // Early exit:
- // - if instruction is invalid or has too few operands (QFP ops need 2 sources
- // + 1 dest),
- // - or does not have a transformation mapping.
- if (MI->getNumOperands() < 3)
+ if (MI->getNumOperands() == 2)
+ return optimizeQfpOneOp(MI, MBB);
+ else if (MI->getNumOperands() == 3)
+ return optimizeQfpTwoOp(MI, MBB);
+ else
return false;
+}
+
+bool HexagonQFPOptimizer::optimizeQfpOneOp(MachineInstr *MI,
+ MachineBasicBlock *MBB) {
+
+ unsigned Op0F = 0;
auto It = QFPInstMap.find(MI->getOpcode());
if (It == QFPInstMap.end())
return false;
+
unsigned short InstTy = It->second;
+ // Get the reachind defs of MI
+ MachineInstr *DefMI = MRI->getVRegDef(MI->getOperand(1).getReg());
+ MachineOperand &Res = MI->getOperand(0);
+ if (!Res.isReg())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI->dump());
+ MachineInstr *ReachDefDef = nullptr;
+
+ // Get the reaching def of the reaching def to check for W reg def
+ if (DefMI->getNumOperands() > 1 && DefMI->getOperand(1).isReg() &&
+ DefMI->getOperand(1).getReg().isVirtual())
+ ReachDefDef = MRI->getVRegDef(DefMI->getOperand(1).getReg());
+ unsigned ReachDefOp = DefMI->getOpcode();
+ MachineInstrBuilder MIB;
+
+ // Check if the reaching def is a conversion
+ if (ReachDefOp == Hexagon::V6_vconv_sf_qf32 ||
+ ReachDefOp == Hexagon::V6_vconv_hf_qf16) {
+
+ // Return if the reaching def of reaching def is W type
+ if (ReachDefDef && MRI->getRegClass(ReachDefDef->getOperand(0).getReg()) ==
+ &Hexagon::HvxWRRegClass)
+ return false;
+
+ // Analyze the use operands of the conversion to get their KILL status
+ MachineOperand &SrcOp = DefMI->getOperand(1);
+ Op0F = getKillRegState(SrcOp.isKill());
+ SrcOp.setIsKill(false);
+ MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg())
+ .addReg(SrcOp.getReg(), Op0F, SrcOp.getSubReg());
+ LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
+ return true;
+ }
+ return false;
+}
+
+bool HexagonQFPOptimizer::optimizeQfpTwoOp(MachineInstr *MI,
+ MachineBasicBlock *MBB) {
unsigned Op0F = 0;
unsigned Op1F = 0;
+ auto It = QFPInstMap.find(MI->getOpcode());
+ if (It == QFPInstMap.end())
+ return false;
+ unsigned short InstTy = It->second;
// Get the reaching defs of MI, DefMI1 and DefMI2
MachineInstr *DefMI1 = nullptr;
MachineInstr *DefMI2 = nullptr;
@@ -167,6 +234,9 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
return false;
MachineOperand &Res = MI->getOperand(0);
+ if (!Res.isReg())
+ return false;
+
MachineInstr *Inst1 = nullptr;
MachineInstr *Inst2 = nullptr;
LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI1->dump();
@@ -185,7 +255,8 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
unsigned Def2OP = DefMI2->getOpcode();
MachineInstrBuilder MIB;
- // Case 1: Both reaching defs of MI are qf to sf/hf conversions
+
+  // Check if both of the reaching defs of MI are qf to sf/hf conversions
if ((Def1OP == Hexagon::V6_vconv_sf_qf32 &&
Def2OP == Hexagon::V6_vconv_sf_qf32) ||
(Def1OP == Hexagon::V6_vconv_hf_qf16 &&
@@ -226,7 +297,7 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
return true;
- // Case 2: Left operand is conversion to sf/hf
+ // Check if left operand's reaching def is a conversion to sf/hf
} else if (((Def1OP == Hexagon::V6_vconv_sf_qf32 &&
Def2OP != Hexagon::V6_vconv_sf_qf32) ||
(Def1OP == Hexagon::V6_vconv_hf_qf16 &&
@@ -250,7 +321,7 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
return true;
- // Case 2: Left operand is conversion to sf/hf
+ // Check if right operand's reaching def is a conversion to sf/hf
} else if (((Def1OP != Hexagon::V6_vconv_sf_qf32 &&
Def2OP == Hexagon::V6_vconv_sf_qf32) ||
(Def1OP != Hexagon::V6_vconv_hf_qf16 &&
@@ -258,13 +329,6 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
!DefMI1->isPHI() &&
(MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf)) {
// The second operand of original instruction is converted.
- // In "mix" instructions, "qf" operand is always the first operand.
-
- // Caveat: vsub is not commutative w.r.t operands.
- if (InstTy == Hexagon::V6_vsub_qf16_mix ||
- InstTy == Hexagon::V6_vsub_qf32_mix)
- return false;
-
if (Inst2 && MRI->getRegClass(Inst2->getOperand(0).getReg()) ==
&Hexagon::HvxWRRegClass)
return false;
@@ -275,10 +339,26 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
Op1F = getKillRegState(Src2.isKill());
Src2.setIsKill(false);
Op0F = getKillRegState(Src1.isKill());
- MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg())
- .addReg(Src2.getReg(), Op1F,
- Src2.getSubReg()) // Notice the operands are flipped.
- .addReg(Src1.getReg(), Op0F, Src1.getSubReg());
+ if (InstTy == Hexagon::V6_vsub_qf16_mix ||
+ InstTy == Hexagon::V6_vsub_qf32_mix) {
+ if (!HST->useHVXV81Ops())
+        // vsub_(hf|sf)_mix instructions are only available on HVX v81+
+ return false;
+ // vsub is not commutative w.r.t. operands -> treat it as a special case
+ // to choose the correct mix instruction.
+ if (Def2OP == Hexagon::V6_vconv_sf_qf32)
+ InstTy = Hexagon::V6_vsub_sf_mix;
+ else if (Def2OP == Hexagon::V6_vconv_hf_qf16)
+ InstTy = Hexagon::V6_vsub_hf_mix;
+ MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg())
+ .addReg(Src1.getReg(), Op0F, Src1.getSubReg())
+ .addReg(Src2.getReg(), Op1F, Src2.getSubReg());
+ } else {
+ MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg())
+ .addReg(Src2.getReg(), Op1F,
+ Src2.getSubReg()) // Notice the operands are flipped.
+ .addReg(Src1.getReg(), Op0F, Src1.getSubReg());
+ }
LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
return true;
}
@@ -309,15 +389,18 @@ bool HexagonQFPOptimizer::runOnMachineFunction(MachineFunction &MF) {
while (MII != MBBI->instr_end()) {
MachineInstr *MI = &*MII;
++MII; // As MI might be removed.
-
- if (QFPInstMap.count(MI->getOpcode()) &&
- MI->getOpcode() != Hexagon::V6_vconv_sf_qf32 &&
- MI->getOpcode() != Hexagon::V6_vconv_hf_qf16) {
- LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump());
- if (optimizeQfp(MI, MBB)) {
- MI->eraseFromParent();
- LLVM_DEBUG(dbgs() << "\t....Removing....");
- Changed = true;
+ if (QFPInstMap.count(MI->getOpcode())) {
+ auto OpC = MI->getOpcode();
+ if (DisableQFOptForMul && HII->isQFPMul(MI))
+ continue;
+ if (OpC != Hexagon::V6_vconv_sf_qf32 &&
+ OpC != Hexagon::V6_vconv_hf_qf16) {
+ LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump());
+ if (optimizeQfp(MI, MBB)) {
+ MI->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "\t....Removing....");
+ Changed = true;
+ }
}
}
}
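The new one-operand path mirrors the existing two-operand folding: when an op's only source is a qf-to-sf/hf conversion, the op is rewritten against the unconverted qf value and the conversion becomes dead. Schematically, using the vabs pairing from QFPInstMap above (register names assumed):

  // before:
  //   %s = V6_vconv_sf_qf32 %q      ; qf32 -> sf
  //   %a = V6_vabs_qf32_sf %s
  // after:
  //   %a = V6_vabs_qf32_qf32 %q     ; consumes the qf32 value directly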
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 2f1a7ad..a3deb36 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -305,7 +305,8 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
uint64_t StartingOffset = 0) {
SmallVector<EVT, 16> TempVTs;
SmallVector<uint64_t, 16> TempOffsets;
- ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
+ ComputeValueVTs(TLI, DL, Ty, TempVTs, /*MemVTs=*/nullptr, &TempOffsets,
+ StartingOffset);
for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 780e124..122738c 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -2750,6 +2750,10 @@ void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
if (isSpecialLLVMGlobalArrayToSkip(GV) || isSpecialLLVMGlobalArrayForStaticInit(GV))
return;
+ // Ignore non-emitted data.
+ if (GV->getSection() == "llvm.metadata")
+ return;
+
// If the Global Variable has the toc-data attribute, it needs to be emitted
// when we emit the .toc section.
if (GV->hasAttribute("toc-data")) {
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index b37b740..f881c4c 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -789,6 +789,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
// Unroll the probe loop depending on the number of iterations.
if (Offset < ProbeSize * 5) {
+ uint64_t CFAAdjust = RealStackSize - Offset;
+
uint64_t CurrentOffset = 0;
while (CurrentOffset + ProbeSize <= Offset) {
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
@@ -802,7 +804,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
CurrentOffset += ProbeSize;
if (EmitCFI)
- CFIBuilder.buildDefCFAOffset(CurrentOffset);
+ CFIBuilder.buildDefCFAOffset(CurrentOffset + CFAAdjust);
}
uint64_t Residual = Offset - CurrentOffset;
@@ -810,7 +812,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
StackOffset::getFixed(-Residual), Flag, getStackAlign());
if (EmitCFI)
- CFIBuilder.buildDefCFAOffset(Offset);
+ CFIBuilder.buildDefCFAOffset(RealStackSize);
if (DynAllocation) {
// s[d|w] zero, 0(sp)
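The CFA fix accounts for stack that was already allocated before the probe loop. Worked numbers (assumed): RealStackSize = 1024, probed Offset = 768, ProbeSize = 256, hence CFAAdjust = 1024 - 768 = 256.

  // After the 1st probe: sp is 256 + 256 = 512 below the CFA, so emit
  //   .cfi_def_cfa_offset 512 (previously, incorrectly, 256).
  // After the residual adjustment the frame is fully allocated and the
  //   offset is simply RealStackSize = 1024 (previously Offset = 768).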
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 995ae75..3b69eda 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -17867,6 +17867,7 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N,
SmallVector<SDNode *> Worklist;
SmallPtrSet<SDNode *, 8> Inserted;
+ SmallPtrSet<SDNode *, 8> ExtensionsToRemove;
Worklist.push_back(N);
Inserted.insert(N);
SmallVector<CombineResult> CombinesToApply;
@@ -17876,22 +17877,25 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N,
NodeExtensionHelper LHS(Root, 0, DAG, Subtarget);
NodeExtensionHelper RHS(Root, 1, DAG, Subtarget);
- auto AppendUsersIfNeeded = [&Worklist, &Subtarget,
- &Inserted](const NodeExtensionHelper &Op) {
- if (Op.needToPromoteOtherUsers()) {
- for (SDUse &Use : Op.OrigOperand->uses()) {
- SDNode *TheUser = Use.getUser();
- if (!NodeExtensionHelper::isSupportedRoot(TheUser, Subtarget))
- return false;
- // We only support the first 2 operands of FMA.
- if (Use.getOperandNo() >= 2)
- return false;
- if (Inserted.insert(TheUser).second)
- Worklist.push_back(TheUser);
- }
- }
- return true;
- };
+ auto AppendUsersIfNeeded =
+ [&Worklist, &Subtarget, &Inserted,
+ &ExtensionsToRemove](const NodeExtensionHelper &Op) {
+ if (Op.needToPromoteOtherUsers()) {
+ // Remember that we're supposed to remove this extension.
+ ExtensionsToRemove.insert(Op.OrigOperand.getNode());
+ for (SDUse &Use : Op.OrigOperand->uses()) {
+ SDNode *TheUser = Use.getUser();
+ if (!NodeExtensionHelper::isSupportedRoot(TheUser, Subtarget))
+ return false;
+ // We only support the first 2 operands of FMA.
+ if (Use.getOperandNo() >= 2)
+ return false;
+ if (Inserted.insert(TheUser).second)
+ Worklist.push_back(TheUser);
+ }
+ }
+ return true;
+ };
// Control the compile time by limiting the number of node we look at in
// total.
@@ -17912,6 +17916,15 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N,
std::optional<CombineResult> Res =
FoldingStrategy(Root, LHS, RHS, DAG, Subtarget);
if (Res) {
+ // If this strategy wouldn't remove an extension we're supposed to
+ // remove, reject it.
+ if (!Res->LHSExt.has_value() &&
+ ExtensionsToRemove.contains(LHS.OrigOperand.getNode()))
+ continue;
+ if (!Res->RHSExt.has_value() &&
+ ExtensionsToRemove.contains(RHS.OrigOperand.getNode()))
+ continue;
+
Matched = true;
CombinesToApply.push_back(*Res);
// All the inputs that are extended need to be folded, otherwise
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
index 24ebbc3..41071b2 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
@@ -654,8 +654,17 @@ foreach mx = SchedMxList in {
foreach sew = SchedSEWSet<mx>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defvar VIRedLat = GetLMULValue<[5, 5, 5, 7, 11, 19, 35], mx>.c;
+ defvar VIRedOcc = GetLMULValue<[1, 1, 2, 2, 4, 10, 35], mx>.c;
+ let Latency = VIRedLat, ReleaseAtCycles = [VIRedOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+
+ // Pattern for vredsum: 5/5/5/7/11/19/35
+ // Pattern for vredand, vredor, vredxor: 4/4/4/6/10/18/34
+ // They are grouped together, so we use the worst-case vredsum latency.
+      // TODO: split vredand, vredor, vredxor into separate scheduling classes.
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
}
}
@@ -663,7 +672,27 @@ foreach mx = SchedMxListWRed in {
foreach sew = SchedSEWSet<mx, 0, 1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defvar VIRedLat = GetLMULValue<[5, 5, 5, 7, 11, 19, 35], mx>.c;
+ defvar VIRedOcc = GetLMULValue<[1, 1, 2, 2, 4, 10, 35], mx>.c;
+ let Latency = VIRedLat, ReleaseAtCycles = [VIRedOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, 1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+
+    // Latency for vfredmax.vs, vfredmin.vs: 12/12/12/15/21/33/57
+    // Latency for vfredusum.vs is slightly lower for e16/e32;
+    // we use the worst case.
+ defvar VFRedLat = GetLMULValue<[12, 12, 12, 15, 21, 33, 57], mx>.c;
+ defvar VFRedOcc = GetLMULValue<[8, 8, 8, 8, 14, 20, 57], mx>.c;
+ let Latency = VFRedLat, ReleaseAtCycles = [VFRedOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
@@ -671,9 +700,20 @@ foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, 1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ // Compute latency based on SEW
+ defvar VFRedOV_FromLat = !cond(
+ !eq(sew, 16) : ConstValueUntilLMULThenDouble<"MF4", 12, mx>.c,
+ !eq(sew, 32) : ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c,
+ !eq(sew, 64) : ConstValueUntilLMULThenDouble<"M1", 12, mx>.c
+ );
+ defvar VFRedOV_FromOcc = !cond(
+ !eq(sew, 16) : GetLMULValue<[8, 8, 20, 24, 48, 96, 384], mx>.c,
+ !eq(sew, 32) : GetLMULValue<[8, 8, 8, 12, 24, 48, 192], mx>.c,
+ !eq(sew, 64) : GetLMULValue<[6, 6, 6, 6, 12, 24, 96], mx>.c
+ );
+ let Latency = VFRedOV_FromLat, ReleaseAtCycles = [VFRedOV_FromOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
@@ -681,8 +721,18 @@ foreach mx = SchedMxListFWRed in {
foreach sew = SchedSEWSet<mx, 1, 1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defvar VFRedOVLat = !cond(
+ !eq(sew, 16) : ConstValueUntilLMULThenDouble<"MF4", 16, mx>.c,
+ !eq(sew, 32) : ConstValueUntilLMULThenDouble<"MF2", 16, mx>.c,
+ );
+ defvar VFRedOVOcc = !cond(
+ !eq(sew, 16) : GetLMULValue<[11, 11, 27, 32, 64, 128, 512], mx>.c,
+ !eq(sew, 32) : GetLMULValue<[11, 11, 11, 16, 32, 64, 256], mx>.c,
+ );
+ let Latency = VFRedOVLat, ReleaseAtCycles = [VFRedOVOcc] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
}
}
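All of these entries rely on GetLMULValue selecting the element that matches the operation's LMUL. Assuming the index order MF8, MF4, MF2, M1, M2, M4, M8, the integer-reduction numbers decode as:

  // GetLMULValue<[5, 5, 5, 7, 11, 19, 35], mx>.c
  //   MF8 -> 5, MF4 -> 5, MF2 -> 5, M1 -> 7, M2 -> 11, M4 -> 19, M8 -> 35
  // i.e. latency roughly doubles with each LMUL doubling from M1 upward.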
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 168e041..d103953 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53354,6 +53354,7 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
// i32 sub value.
static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
using namespace SDPatternMatch;
SDValue StoredVal = St->getValue();
@@ -53451,6 +53452,8 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
if (!StoredVal.hasOneUse()) {
SDValue NewLoad =
DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand());
+ for (SDNode *User : StoredVal->users())
+ DCI.AddToWorklist(User);
DAG.ReplaceAllUsesWith(StoredVal, NewLoad);
}
return NewStore;
@@ -53682,7 +53685,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
}
}
- if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget))
+ if (SDValue R = narrowBitOpRMW(St, dl, DAG, DCI, Subtarget))
return R;
// Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
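Queuing the users before the RAUW is the usual way to make the DAG combiner revisit nodes whose operand just changed. The pattern, reduced to its core (names assumed):

  // Re-add every user of the value being replaced, then rewrite the uses.
  for (SDNode *User : OldVal->users())
    DCI.AddToWorklist(User);
  DAG.ReplaceAllUsesWith(OldVal, NewLoad);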
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 975a271..96bef0e 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -174,8 +174,8 @@ constexpr GPUInfo AMDGCNGPUs[] = {
{{"gfx1153"}, {"gfx1153"}, GK_GFX1153, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
{{"gfx1200"}, {"gfx1200"}, GK_GFX1200, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
{{"gfx1201"}, {"gfx1201"}, GK_GFX1201, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
- {{"gfx1250"}, {"gfx1250"}, GK_GFX1250, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
- {{"gfx1251"}, {"gfx1251"}, GK_GFX1251, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
+ {{"gfx1250"}, {"gfx1250"}, GK_GFX1250, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS},
+ {{"gfx1251"}, {"gfx1251"}, GK_GFX1251, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS},
{{"gfx9-generic"}, {"gfx9-generic"}, GK_GFX9_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK},
{{"gfx10-1-generic"}, {"gfx10-1-generic"}, GK_GFX10_1_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index af53fa0..02f06be 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -734,7 +734,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC();
// Reserve bit 60-63 for other information purpose.
- FunctionHash &= 0x0FFFFFFFFFFFFFFF;
+ FunctionHash &= NamedInstrProfRecord::FUNC_HASH_MASK;
if (IsCS)
NamedInstrProfRecord::setCSFlagInHash(FunctionHash);
LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n"
diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 42b1fdf..8aa8aa2 100644
--- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -39,36 +39,36 @@ using namespace llvm;
STATISTIC(NumBroken, "Number of blocks inserted");
namespace {
- struct BreakCriticalEdges : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- BreakCriticalEdges() : FunctionPass(ID) {
- initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry());
- }
+struct BreakCriticalEdges : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ BreakCriticalEdges() : FunctionPass(ID) {
+ initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry());
+ }
- bool runOnFunction(Function &F) override {
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ bool runOnFunction(Function &F) override {
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
- auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+ auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+ auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
- auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
- auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
- unsigned N =
- SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT));
- NumBroken += N;
- return N > 0;
- }
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+ auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+ unsigned N = SplitAllCriticalEdges(
+ F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT));
+ NumBroken += N;
+ return N > 0;
+ }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
- // No loop canonicalization guarantees are broken by this pass.
- AU.addPreservedID(LoopSimplifyID);
- }
- };
-}
+ // No loop canonicalization guarantees are broken by this pass.
+ AU.addPreservedID(LoopSimplifyID);
+ }
+};
+} // namespace
char BreakCriticalEdges::ID = 0;
INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges",
@@ -76,6 +76,7 @@ INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges",
// Publicly exposed interface to pass...
char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID;
+
FunctionPass *llvm::createBreakCriticalEdgesPass() {
return new BreakCriticalEdges();
}
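
The reflowed pass body above relies on the getAnalysisIfAvailable idiom: each analysis is optional, and a null pointer flows straight into CriticalEdgeSplittingOptions to mean "nothing to keep up to date". A rough standalone sketch of that idiom, with invented stand-in types rather than LLVM's real registry:

#include <iostream>
#include <map>
#include <string>

struct DomTree {};

// Stand-in for pass-manager caching; not LLVM's actual API.
struct AnalysisCache {
  std::map<std::string, void *> Cached;
  template <typename T> T *getIfAvailable(const std::string &Name) {
    auto It = Cached.find(Name);
    return It == Cached.end() ? nullptr : static_cast<T *>(It->second);
  }
};

// Like SplitAllCriticalEdges taking a possibly-null DominatorTree:
// null simply means "do not update that analysis".
unsigned splitAllCriticalEdges(DomTree *DT) {
  std::cout << (DT ? "updating dominator tree\n" : "no dominator tree cached\n");
  return 0; // number of edges split; always 0 in this toy
}

int main() {
  AnalysisCache C; // nothing has been computed yet
  DomTree *DT = C.getIfAvailable<DomTree>("domtree");
  return splitAllCriticalEdges(DT) > 0 ? 1 : 0;
}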
diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
index 7343c79..9f6d89e 100644
--- a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -40,22 +40,22 @@ using namespace llvm;
namespace {
- struct QuotRemPair {
- Value *Quotient;
- Value *Remainder;
-
- QuotRemPair(Value *InQuotient, Value *InRemainder)
- : Quotient(InQuotient), Remainder(InRemainder) {}
- };
-
- /// A quotient and remainder, plus a BB from which they logically "originate".
- /// If you use Quotient or Remainder in a Phi node, you should use BB as its
- /// corresponding predecessor.
- struct QuotRemWithBB {
- BasicBlock *BB = nullptr;
- Value *Quotient = nullptr;
- Value *Remainder = nullptr;
- };
+struct QuotRemPair {
+ Value *Quotient;
+ Value *Remainder;
+
+ QuotRemPair(Value *InQuotient, Value *InRemainder)
+ : Quotient(InQuotient), Remainder(InRemainder) {}
+};
+
+/// A quotient and remainder, plus a BB from which they logically "originate".
+/// If you use Quotient or Remainder in a Phi node, you should use BB as its
+/// corresponding predecessor.
+struct QuotRemWithBB {
+ BasicBlock *BB = nullptr;
+ Value *Quotient = nullptr;
+ Value *Remainder = nullptr;
+};
using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
using BypassWidthsTy = DenseMap<unsigned, unsigned>;
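
The QuotRemWithBB doc comment above encodes an invariant: a quotient or remainder is only meaningful together with the block it "originates" from, and that block must be used as the predecessor when the value feeds a phi. A small standalone sketch of that pairing — the Phi type here is a toy, not LLVM's PHINode:

#include <cassert>
#include <string>
#include <utility>
#include <vector>

struct Value { std::string Name; };
struct BasicBlock { std::string Name; };

// Mirrors QuotRemWithBB: values travel together with their origin block.
struct QuotRemWithBB {
  BasicBlock *BB = nullptr;
  Value *Quotient = nullptr;
  Value *Remainder = nullptr;
};

// Toy phi: each incoming value is paired with its predecessor block,
// which is exactly what the comment says BB should be used for.
struct Phi {
  std::vector<std::pair<Value *, BasicBlock *>> Incoming;
  void addIncoming(Value *V, BasicBlock *Pred) { Incoming.push_back({V, Pred}); }
};

int main() {
  BasicBlock FastBB{"fast"}, SlowBB{"slow"};
  Value FastQ{"q.fast"}, SlowQ{"q.slow"};
  QuotRemWithBB Fast{&FastBB, &FastQ, nullptr};
  QuotRemWithBB Slow{&SlowBB, &SlowQ, nullptr};

  Phi QuotPhi;
  QuotPhi.addIncoming(Fast.Quotient, Fast.BB); // value paired with its BB
  QuotPhi.addIncoming(Slow.Quotient, Slow.BB);
  assert(QuotPhi.Incoming.size() == 2);
  return 0;
}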
diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index 61ffb49..8da6a980 100644
--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -378,7 +378,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
if (P != Preheader) BackedgeBlocks.push_back(P);
}
- // Create and insert the new backedge block...
+ // Create and insert the new backedge block.
BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(),
Header->getName() + ".backedge", F);
BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
@@ -737,39 +737,39 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI,
}
namespace {
- struct LoopSimplify : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- LoopSimplify() : FunctionPass(ID) {
- initializeLoopSimplifyPass(*PassRegistry::getPassRegistry());
- }
+struct LoopSimplify : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ LoopSimplify() : FunctionPass(ID) {
+ initializeLoopSimplifyPass(*PassRegistry::getPassRegistry());
+ }
- bool runOnFunction(Function &F) override;
+ bool runOnFunction(Function &F) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
- // We need loop information to identify the loops...
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
+ // We need loop information to identify the loops.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<SCEVAAWrapperPass>();
- AU.addPreservedID(LCSSAID);
- AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
- AU.addPreserved<BranchProbabilityInfoWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addPreservedID(LCSSAID);
+ AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
+ AU.addPreserved<BranchProbabilityInfoWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
- /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees.
- void verifyAnalysis() const override;
- };
-}
+ /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees.
+ void verifyAnalysis() const override;
+};
+} // namespace
char LoopSimplify::ID = 0;
INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
@@ -780,12 +780,12 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops",
false, false)
-// Publicly exposed interface to pass...
+// Publicly exposed interface to pass.
char &llvm::LoopSimplifyID = LoopSimplify::ID;
Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
/// runOnFunction - Run down all loops in the CFG (recursively, but we could do
-/// it in any convenient order) inserting preheaders...
+/// it in any convenient order) inserting preheaders.
///
bool LoopSimplify::runOnFunction(Function &F) {
bool Changed = false;
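
For context on the "create and insert the new backedge block" step touched above: insertUniqueBackedgeBlock funnels every latch edge through one fresh block so the loop has a single backedge. A minimal sketch of that rewiring with toy types — the real code uses BasicBlock::Create and BranchInst::Create, as visible in the hunk:

#include <iostream>
#include <string>
#include <vector>

struct Block {
  std::string Name;
  std::vector<Block *> Succs;
};

// Redirect every edge Latch_i -> Header through one new block, so the
// loop ends up with a unique backedge.
Block *insertUniqueBackedge(Block &Header, std::vector<Block *> &Latches) {
  Block *BE = new Block{Header.Name + ".backedge", {&Header}};
  for (Block *L : Latches)
    for (Block *&S : L->Succs)
      if (S == &Header)
        S = BE; // latch now branches to BE instead of the header
  return BE;
}

int main() {
  Block H{"header"}, L1{"latch1"}, L2{"latch2"};
  L1.Succs = {&H};
  L2.Succs = {&H};
  std::vector<Block *> Latches{&L1, &L2};
  Block *BE = insertUniqueBackedge(H, Latches);
  std::cout << L1.Succs[0]->Name << " -> " << BE->Succs[0]->Name << "\n";
  // prints: header.backedge -> header
  delete BE;
  return 0;
}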
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 906fa2f..b7224a3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7933,6 +7933,26 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
(!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
}
+
+ // Check that all partial reductions in a chain are only used by other
+ // partial reductions with the same scale factor. Otherwise we end up creating
+ // users of scaled reductions where the types of the other operands don't
+ // match.
+ for (const auto &[Chain, Scale] : PartialReductionChains) {
+ auto AllUsersPartialRdx = [ScaleVal = Scale, this](const User *U) {
+ auto *UI = cast<Instruction>(U);
+ if (isa<PHINode>(UI) && UI->getParent() == OrigLoop->getHeader()) {
+ return all_of(UI->users(), [ScaleVal, this](const User *U) {
+ auto *UI = cast<Instruction>(U);
+ return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal;
+ });
+ }
+ return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal ||
+ !OrigLoop->contains(UI->getParent());
+ };
+ if (!all_of(Chain.Reduction->users(), AllUsersPartialRdx))
+ ScaledReductionMap.erase(Chain.Reduction);
+ }
}
bool VPRecipeBuilder::getScaledReductions(
@@ -8116,11 +8136,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);
- if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) {
- if (auto PartialRed =
- tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()))
- return PartialRed;
- }
+ if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
+ return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8154,9 +8171,9 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
isa<VPPartialReductionRecipe>(BinOpRecipe))
std::swap(BinOp, Accumulator);
- if (ScaleFactor !=
- vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()))
- return nullptr;
+ assert(ScaleFactor ==
+ vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) &&
+ "all accumulators in chain must have same scale factor");
unsigned ReductionOpcode = Reduction->getOpcode();
if (ReductionOpcode == Instruction::Sub) {
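
The comment added in collectScaledReductions states the invariant that the new assert in tryToCreatePartialReduction relies on: every in-loop user of a partial reduction chain must itself be a partial reduction with the same scale factor, otherwise the chain is dropped from the map. A simplified one-pass version of that pruning step, with toy Instr and map types and without the PHI-in-header special case:

#include <map>
#include <vector>

struct Instr {
  std::vector<Instr *> Users;
};

// Drop a chain unless all of its users carry the same scale factor --
// a simplified version of the check added above (the real code uses
// ScaledReductionMap.lookup_or(UI, 0) == ScaleVal).
void pruneMismatchedChains(std::map<Instr *, unsigned> &ScaledReductionMap) {
  std::vector<Instr *> ToErase;
  for (auto &[Reduction, Scale] : ScaledReductionMap) {
    for (Instr *U : Reduction->Users) {
      auto It = ScaledReductionMap.find(U);
      unsigned UserScale = It == ScaledReductionMap.end() ? 0 : It->second;
      if (UserScale != Scale) {
        ToErase.push_back(Reduction);
        break;
      }
    }
  }
  for (Instr *R : ToErase)
    ScaledReductionMap.erase(R);
}

int main() {
  Instr A, B, C;
  A.Users = {&B}; // B uses A with a matching scale factor
  B.Users = {&C}; // C uses B with a different scale factor
  std::map<Instr *, unsigned> M{{&A, 4}, {&B, 4}, {&C, 2}};
  pruneMismatchedChains(M); // B is erased; A survives (its user matched)
  return M.count(&A) == 1 && M.count(&B) == 0 ? 0 : 1;
}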
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index bf3f52c..df835a0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -20996,6 +20996,15 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
return false;
}))
return std::nullopt;
+ if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
+ EI.UserTE->hasCopyableElements() &&
+ EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
+ all_of(VL, [&](Value *V) {
+ if (S.isCopyableElement(V))
+ return true;
+ return isUsedOutsideBlock(V);
+ }))
+ return std::nullopt;
bool HasCopyables = S.areInstructionsWithCopyableElements();
if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
diff --git a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll
index 4346507..181a449 100644
--- a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll
@@ -210,7 +210,7 @@ define void @t3(i64 %n, i64 %m, i64 %lb, ptr %a) {
; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx6, align 4 --> Dst: %2 = load i32, ptr %arrayidx6, align 4
; CHECK-NEXT: da analyze - none!
; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx6, align 4 --> Dst: store i32 %2, ptr %arrayidx8, align 4
-; CHECK-NEXT: da analyze - consistent anti [1 -2]!
+; CHECK-NEXT: da analyze - anti [1 *]!
; CHECK-NEXT: Src: store i32 %2, ptr %arrayidx8, align 4 --> Dst: store i32 %2, ptr %arrayidx8, align 4
; CHECK-NEXT: da analyze - none!
;
diff --git a/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll b/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll
index 44bd9b7..71b9382 100644
--- a/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa 2>&1 \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ALL
+; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa -da-enable-dependence-test=strong-siv 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-STRONG-SIV
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.6.0"
@@ -423,19 +425,33 @@ for.end: ; preds = %for.body
;; *B++ = A[i + 2*n];
define void @strong9(ptr %A, ptr %B, i64 %n) nounwind uwtable ssp {
-; CHECK-LABEL: 'strong9'
-; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4
-; CHECK-NEXT: da analyze - none!
-; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4
-; CHECK-NEXT: da analyze - none!
-; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4
-; CHECK-NEXT: da analyze - confused!
-; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4
-; CHECK-NEXT: da analyze - none!
-; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4
-; CHECK-NEXT: da analyze - confused!
-; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4
-; CHECK-NEXT: da analyze - none!
+; CHECK-ALL-LABEL: 'strong9'
+; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4
+; CHECK-ALL-NEXT: da analyze - none!
+; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4
+; CHECK-ALL-NEXT: da analyze - none!
+; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4
+; CHECK-ALL-NEXT: da analyze - confused!
+; CHECK-ALL-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4
+; CHECK-ALL-NEXT: da analyze - none!
+; CHECK-ALL-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4
+; CHECK-ALL-NEXT: da analyze - confused!
+; CHECK-ALL-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4
+; CHECK-ALL-NEXT: da analyze - none!
+;
+; CHECK-STRONG-SIV-LABEL: 'strong9'
+; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4
+; CHECK-STRONG-SIV-NEXT: da analyze - none!
+; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4
+; CHECK-STRONG-SIV-NEXT: da analyze - flow [*|<]!
+; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4
+; CHECK-STRONG-SIV-NEXT: da analyze - confused!
+; CHECK-STRONG-SIV-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4
+; CHECK-STRONG-SIV-NEXT: da analyze - none!
+; CHECK-STRONG-SIV-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4
+; CHECK-STRONG-SIV-NEXT: da analyze - confused!
+; CHECK-STRONG-SIV-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4
+; CHECK-STRONG-SIV-NEXT: da analyze - none!
;
entry:
%cmp1 = icmp eq i64 %n, 0
@@ -512,3 +528,45 @@ for.body: ; preds = %entry, %for.body
for.end: ; preds = %for.body
ret void
}
+
+
+;; for (long unsigned i = 0; i < 9223372036854775806; i++)
+;; for (long unsigned j = 0; j < 2147483640; j++)
+;; if (i < 3000000000)
+;; A[i] = 0;
+;
+; FIXME: DependenceAnalysis fails to detect the dependency between A[i] and
+; itself, and the issue is not caused by the Strong SIV.
+define void @strong11(ptr %A) nounwind uwtable ssp {
+; CHECK-ALL-LABEL: 'strong11'
+; CHECK-ALL-NEXT: Src: store i32 0, ptr %arrayidx, align 4 --> Dst: store i32 0, ptr %arrayidx, align 4
+; CHECK-ALL-NEXT: da analyze - none!
+;
+; CHECK-STRONG-SIV-LABEL: 'strong11'
+; CHECK-STRONG-SIV-NEXT: Src: store i32 0, ptr %arrayidx, align 4 --> Dst: store i32 0, ptr %arrayidx, align 4
+; CHECK-STRONG-SIV-NEXT: da analyze - consistent output [0 S]!
+;
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.cond.cleanup3
+ %i.017 = phi i64 [ 0, %entry ], [ %inc8, %for.cond.cleanup3 ]
+ %cmp5 = icmp samesign ult i64 %i.017, 3000000000
+ %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %i.017
+ br i1 %cmp5, label %for.body4.us, label %for.cond.cleanup3
+
+for.body4.us: ; preds = %for.cond1.preheader, %for.body4.us
+ %j.016.us = phi i64 [ %inc.us, %for.body4.us ], [ 0, %for.cond1.preheader ]
+ store i32 0, ptr %arrayidx, align 4
+ %inc.us = add nuw nsw i64 %j.016.us, 1
+ %exitcond.not = icmp eq i64 %inc.us, 2147483640
+ br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4.us
+
+for.cond.cleanup: ; preds = %for.cond.cleanup3
+ ret void
+
+for.cond.cleanup3: ; preds = %for.body4.us, %for.cond1.preheader
+ %inc8 = add nuw nsw i64 %i.017, 1
+ %exitcond19.not = icmp eq i64 %inc8, 9223372036854775806
+ br i1 %exitcond19.not, label %for.cond.cleanup, label %for.cond1.preheader
+}
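
The strong11 FIXME points at a dependence DA misses: for a fixed i, the inner loop stores to the same A[i] on every j iteration, so the store plainly depends on itself (a consistent output dependence, [0 S]). A shrunken runnable version of the loop nest — trip counts reduced from the original constants so it actually finishes:

#include <cassert>
#include <cstdint>

int main() {
  // Shrunken stand-ins for 9223372036854775806, 2147483640, 3000000000.
  const int64_t N = 8, M = 6, Guard = 5;
  int A[8] = {};
  int64_t StoresToA3 = 0;
  for (int64_t i = 0; i < N; ++i)
    for (int64_t j = 0; j < M; ++j)
      if (i < Guard) {
        A[i] = 0; // same address for every j: the store overwrites itself
        if (i == 3)
          ++StoresToA3;
      }
  assert(StoresToA3 == M); // A[3] written M times -> dependence exists
  return 0;
}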
diff --git a/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll b/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll
index 66880b5..f7f869d 100644
--- a/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll
@@ -1,12 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa 2>&1 | FileCheck %s
define void @f1() {
; CHECK-LABEL: 'f1'
-; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4
+; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4
; CHECK-NEXT: da analyze - consistent output [S]!
-; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4
+; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4
; CHECK-NEXT: da analyze - consistent flow [|<]!
-; CHECK-NEXT: Src: %2 = load i32, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4
+; CHECK-NEXT: Src: %2 = load i32, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4
; CHECK-NEXT: da analyze - consistent input [S]!
;
entry:
@@ -34,11 +35,11 @@ exit: ; preds = %for.2.body
define void @f2() {
; CHECK-LABEL: 'f2'
-; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4
+; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4
; CHECK-NEXT: da analyze - consistent output [S]!
-; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4
-; CHECK-NEXT: da analyze - flow [|<] / assuming 1 loop level(s) fused: [S|<]!
-; CHECK-NEXT: Src: %3 = load i32, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4
+; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4
+; CHECK-NEXT: da analyze - flow [|<] / assuming 1 loop level(s) fused: [S|<]!
+; CHECK-NEXT: Src: %3 = load i32, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4
; CHECK-NEXT: da analyze - consistent input [S]!
;
entry:
diff --git a/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll
index bf0fafc..6fd71ac 100644
--- a/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll
@@ -12,19 +12,24 @@
; A[2*i - 4] = 2;
; }
;
-; FIXME: DependenceAnalysis currently detects no dependency between the two
-; stores, but it does exist. For example, each store will access A[0] when i
-; is 1 and 2 respectively.
-; The root cause is that the product of the BTC and the coefficient
-; ((1LL << 62) - 1 and 2) overflows in a signed sense.
+; FIXME: DependenceAnalysis fails to detect the dependency between the two
+; stores, and the issue is not caused by the Strong SIV.
define void @strongsiv_const_ovfl(ptr %A) {
-; CHECK-LABEL: 'strongsiv_const_ovfl'
-; CHECK-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
-; CHECK-NEXT: da analyze - none!
-; CHECK-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
-; CHECK-NEXT: da analyze - none!
-; CHECK-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
-; CHECK-NEXT: da analyze - none!
+; CHECK-ALL-LABEL: 'strongsiv_const_ovfl'
+; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
+; CHECK-ALL-NEXT: da analyze - none!
+; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-ALL-NEXT: da analyze - none!
+; CHECK-ALL-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-ALL-NEXT: da analyze - none!
+;
+; CHECK-STRONG-SIV-LABEL: 'strongsiv_const_ovfl'
+; CHECK-STRONG-SIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1
+; CHECK-STRONG-SIV-NEXT: da analyze - none!
+; CHECK-STRONG-SIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-STRONG-SIV-NEXT: da analyze - consistent output [1]!
+; CHECK-STRONG-SIV-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1
+; CHECK-STRONG-SIV-NEXT: da analyze - none!
;
entry:
br label %loop.header
@@ -64,5 +69,4 @@ exit:
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-ALL: {{.*}}
-; CHECK-STRONG-SIV: {{.*}}
+; CHECK: {{.*}}
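
The comment this hunk removed pinned the missed dependence concretely: both stores reach A[0], at i == 1 and i == 2 respectively. A quick check of that index arithmetic — the second store's A[2*i - 4] appears in the test comment above, while the first store's expression is not shown in this hunk, so A[2*i - 2] is an assumption consistent with the removed comment's claim:

#include <cassert>

int main() {
  auto Idx0 = [](long long i) { return 2 * i - 2; }; // assumed first store
  auto Idx1 = [](long long i) { return 2 * i - 4; }; // from the test comment
  // Both stores hit A[0], one iteration apart -> an output dependence
  // between the two stores really does exist.
  assert(Idx0(1) == 0 && Idx1(2) == 0);
  return 0;
}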
diff --git a/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll
index c5ff988..75be963 100644
--- a/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll
@@ -13,7 +13,7 @@
; FIXME: DependenceAnalysis currently detects no dependency between the two
; stores, but it does exist. For example, each store will access A[0] when i
; is 1 and 0 respectively.
-; The root cause is that the product of the BTC and the coefficient
+; The root cause is that the product of the BTC and the coefficient
; ((1LL << 62) - 1 and 2) overflows in a signed sense.
define void @symbolicrdiv_prod_ovfl(ptr %A) {
; CHECK-ALL-LABEL: 'symbolicrdiv_prod_ovfl'
@@ -75,10 +75,10 @@ exit:
; FIXME: DependenceAnalysis currently detects no dependency between the two
; stores, but it does exist. For example,
;
-; memory access | i == 2^61 | i == 2^61 + 2^59 | i == 2^61 + 2^60
+; memory access | i == 2^61 | i == 2^61 + 2^59 | i == 2^61 + 2^60
; -------------------------|-----------|------------------|-------------------
-; A[2*i - 2^62] (offset0) | | A[2^60] | A[2^61]
-; A[-i + 2^62] (offset1) | A[2^61] | | A[2^60]
+; A[2*i - 2^62] (offset0) | | A[2^60] | A[2^61]
+; A[-i + 2^62] (offset1) | A[2^61] | | A[2^60]
;
; The root cause is that the calculation of the difference between the two
; constants (-2^62 and 2^62) overflows in a signed sense.
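
The FIXME above attributes the miss to the difference of the two constants overflowing in a signed sense: the true value of 2^62 - (-2^62) is 2^63, one past INT64_MAX, so computing it in int64_t wraps. A quick verification using Clang/GCC's __int128 extension to hold the exact result:

#include <cstdint>
#include <iostream>

int main() {
  const int64_t C0 = -(1LL << 62); // constants from the comment above
  const int64_t C1 = (1LL << 62);
  // C1 - C0 evaluated directly in int64_t would be signed overflow (UB),
  // so widen first; the exact difference is 2^63.
  __int128 Diff = static_cast<__int128>(C1) - static_cast<__int128>(C0);
  std::cout << (Diff > INT64_MAX ? "difference overflows int64_t\n"
                                 : "difference fits\n");
  return 0;
}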
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index 1812e17..10e83b7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -189,15 +189,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: s_lshr_b32 s1, s1, 24
; GFX10-NEXT: s_lshr_b32 s8, s2, 16
-; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s5, s5, 8
; GFX10-NEXT: v_mov_b32_e32 v5, s0
; GFX10-NEXT: s_lshr_b32 s0, s7, 8
; GFX10-NEXT: v_mov_b32_e32 v6, s6
-; GFX10-NEXT: v_mov_b32_e32 v7, s1
-; GFX10-NEXT: s_lshr_b32 s1, s9, 8
; GFX10-NEXT: v_mov_b32_e32 v8, s5
; GFX10-NEXT: v_mov_b32_e32 v9, s0
; GFX10-NEXT: ds_write_b8 v1, v0
@@ -208,18 +204,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: ds_write_b8 v1, v8 offset:1
; GFX10-NEXT: ds_write_b8 v1, v9 offset:5
; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_mov_b32_e32 v3, s2
-; GFX10-NEXT: v_mov_b32_e32 v10, s1
+; GFX10-NEXT: s_lshr_b32 s1, s1, 24
+; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s0, s2, 24
-; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
-; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
-; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
+; GFX10-NEXT: v_mov_b32_e32 v7, s1
+; GFX10-NEXT: s_lshr_b32 s1, s9, 8
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_and_b32 s0, 0xffff, s3
-; GFX10-NEXT: s_lshr_b32 s1, s3, 16
+; GFX10-NEXT: v_mov_b32_e32 v10, s1
; GFX10-NEXT: s_lshr_b32 s0, s0, 8
+; GFX10-NEXT: s_lshr_b32 s1, s3, 16
; GFX10-NEXT: v_mov_b32_e32 v2, s3
+; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
+; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
+; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
; GFX10-NEXT: v_mov_b32_e32 v3, s0
; GFX10-NEXT: s_lshr_b32 s0, s3, 24
; GFX10-NEXT: v_mov_b32_e32 v4, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index b33b8a7..4a22a91 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -272,10 +272,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64
; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80
; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96
@@ -288,6 +284,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208
; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB6_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 74552a5..08e64da 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -3105,22 +3105,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-LABEL: bitcast_v32i32_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
@@ -3253,6 +3237,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr41
@@ -3284,14 +3284,13 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB12_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -3523,7 +3522,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; SI-NEXT: s_cbranch_execz .LBB12_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -4317,22 +4315,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-LABEL: bitcast_v32i32_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -4437,6 +4419,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -4542,129 +4540,129 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; VI-NEXT: v_mov_b32_e32 v55, v39
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
-; VI-NEXT: v_mov_b32_e32 v55, v39
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27
; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10
; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9
; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9
@@ -5286,6 +5284,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-LABEL: bitcast_v32i32_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -5302,9 +5304,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -5437,7 +5436,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -5493,7 +5491,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: s_waitcnt vmcnt(45)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -5508,7 +5506,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
+; GFX9-NEXT: s_waitcnt vmcnt(47)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -5520,149 +5518,147 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15
@@ -5670,7 +5666,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12
; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8
@@ -5698,7 +5696,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: s_cbranch_execz .LBB12_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_u32_e32 v32, 3, v32
-; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: s_waitcnt vmcnt(44)
; GFX9-NEXT: v_add_u32_e32 v31, 3, v31
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; GFX9-NEXT: v_add_u32_e32 v30, 3, v30
@@ -6755,7 +6753,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -6776,10 +6778,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -7416,7 +7414,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -10666,7 +10664,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v16, s32
; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4
; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8
@@ -11599,7 +11597,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: v_readlane_b32 s35, v16, 3
; GFX11-NEXT: v_readlane_b32 s34, v16, 2
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v16, off, s32
; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8
@@ -11812,13 +11810,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196
@@ -11979,44 +11990,30 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB14_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
@@ -12025,11 +12022,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
@@ -12632,7 +12629,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB14_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
@@ -12646,8 +12642,8 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
@@ -13327,13 +13323,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -13470,34 +13478,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -13983,7 +13977,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_add_u16_e32 v9, 3, v61
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -14561,13 +14554,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -14709,34 +14716,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -15223,7 +15216,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v61
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -16362,7 +16354,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584
@@ -16395,7 +16387,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456
@@ -17336,7 +17328,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36
; GFX11-FAKE16-NEXT: .LBB14_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400
@@ -17369,7 +17361,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528
@@ -18086,24 +18078,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch .LBB15_3
; SI-NEXT: .LBB15_2:
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v55, v56
; SI-NEXT: v_mov_b32_e32 v42, v46
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
@@ -18114,10 +18095,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: .LBB15_3: ; %Flow
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_mov_b32_e32 v35, v57
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
@@ -18127,7 +18120,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -18722,13 +18714,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5
; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7
; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@@ -18956,11 +18948,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
; VI-NEXT: s_or_b32 s4, s4, s5
@@ -18970,11 +18962,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_lshl_b32 s7, s23, 8
; VI-NEXT: s_lshl_b32 s8, s27, 8
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
@@ -18982,6 +18971,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -19190,12 +19181,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_branch .LBB15_3
; VI-NEXT: .LBB15_2:
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v44, v56
; VI-NEXT: v_mov_b32_e32 v41, v33
; VI-NEXT: v_mov_b32_e32 v50, v40
@@ -19213,6 +19198,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v54, v53
; VI-NEXT: v_mov_b32_e32 v52, v36
; VI-NEXT: v_mov_b32_e32 v49, v51
@@ -19222,7 +19213,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v51, v41
; VI-NEXT: v_mov_b32_e32 v36, v44
; VI-NEXT: v_mov_b32_e32 v53, v54
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v54, v60
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -19235,7 +19226,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: ; %bb.4: ; %cmp.true
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
@@ -19820,8 +19810,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9
; GFX9-NEXT: s_waitcnt vmcnt(5)
@@ -20000,16 +19990,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_lshl_b32 s6, s19, 8
; GFX9-NEXT: s_lshl_b32 s7, s23, 8
; GFX9-NEXT: s_lshl_b32 s8, s27, 8
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -20036,9 +20028,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -20054,14 +20045,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
@@ -20073,10 +20066,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_mov_b32_e32 v61, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -20089,10 +20083,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_mov_b32_e32 v37, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
@@ -20106,17 +20102,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -20132,45 +20133,24 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(11)
-; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(10)
; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_mov_b32_e32 v40, v30
; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -20221,18 +20201,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_branch .LBB15_3
; GFX9-NEXT: .LBB15_2:
-; GFX9-NEXT: v_mov_b32_e32 v38, v51
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v33, v43
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
@@ -20246,6 +20214,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v38, v51
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: v_mov_b32_e32 v36, v31
; GFX9-NEXT: v_mov_b32_e32 v40, v30
@@ -20683,7 +20663,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -20716,7 +20696,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -21573,7 +21553,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -21606,7 +21586,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -21624,7 +21604,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -21657,7 +21637,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -22514,7 +22494,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-FAKE16-NEXT: .LBB15_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -22547,7 +22527,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -26129,7 +26109,10 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -26146,9 +26129,6 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -26714,7 +26694,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -29181,7 +29161,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
@@ -29214,7 +29194,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
@@ -29247,7 +29227,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
@@ -30049,7 +30029,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
@@ -30082,7 +30062,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
@@ -30115,7 +30095,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
@@ -30155,7 +30135,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
@@ -30188,7 +30168,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
@@ -30221,7 +30201,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
@@ -30913,7 +30893,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
@@ -30946,7 +30926,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
@@ -30979,7 +30959,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
@@ -34732,7 +34712,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -34765,7 +34745,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -34798,7 +34778,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -34876,7 +34856,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -34909,7 +34889,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -34942,7 +34922,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -35000,6 +34980,10 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
; SI-LABEL: bitcast_v32i32_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -35016,10 +35000,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr63
@@ -35051,14 +35031,13 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; kill: killed $vgpr39
; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB24_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16
@@ -35103,7 +35082,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
; SI-NEXT: s_cbranch_execz .LBB24_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
@@ -35356,7 +35334,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -36338,7 +36316,13 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
@@ -36370,12 +36354,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -36391,7 +36369,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
@@ -36608,7 +36585,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
@@ -37782,7 +37758,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -37815,7 +37791,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -37848,7 +37824,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -37926,7 +37902,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -37959,7 +37935,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -37992,7 +37968,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -40033,22 +40009,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; SI-LABEL: bitcast_v32f32_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
@@ -40181,6 +40141,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr41
@@ -40212,14 +40188,13 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB36_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -40451,7 +40426,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; SI-NEXT: s_cbranch_execz .LBB36_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_f32_e32 v31, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v32, 1.0, v32
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -41245,22 +41219,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-LABEL: bitcast_v32f32_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -41365,6 +41323,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -41470,129 +41444,129 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; VI-NEXT: v_mov_b32_e32 v55, v39
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
-; VI-NEXT: v_mov_b32_e32 v55, v39
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27
; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10
; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9
; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9
@@ -42214,6 +42188,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-LABEL: bitcast_v32f32_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -42230,9 +42208,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -42365,7 +42340,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -42421,7 +42395,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: s_waitcnt vmcnt(45)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -42436,7 +42410,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
+; GFX9-NEXT: s_waitcnt vmcnt(47)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -42448,149 +42422,147 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15
@@ -42598,7 +42570,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12
; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8
@@ -42626,7 +42600,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: s_cbranch_execz .LBB36_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32
-; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: s_waitcnt vmcnt(44)
; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30
@@ -43666,7 +43640,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -43687,10 +43665,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -44310,7 +44284,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -44770,27 +44744,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 24
-; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8
-; SI-NEXT: v_add_f32_e64 v53, s23, 1.0
-; SI-NEXT: v_add_f32_e64 v52, s22, 1.0
; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8
; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v2
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
@@ -44842,24 +44800,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v12
+; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v16
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16
+; SI-NEXT: v_add_f32_e64 v53, s23, 1.0
+; SI-NEXT: v_add_f32_e64 v52, s22, 1.0
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v16
+; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v21
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v21
+; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v26
@@ -44868,6 +44835,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v26
; SI-NEXT: v_add_f32_e64 v41, s21, 1.0
; SI-NEXT: v_add_f32_e64 v40, s20, 1.0
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v26
@@ -44875,6 +44844,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: v_add_f32_e64 v57, s16, 1.0
; SI-NEXT: v_add_f32_e64 v46, s19, 1.0
; SI-NEXT: v_add_f32_e64 v45, s18, 1.0
+; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8
; SI-NEXT: v_lshr_b64 v[31:32], v[40:41], 16
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -44885,6 +44855,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: v_lshr_b64 v[27:28], v[40:41], 24
; SI-NEXT: v_lshr_b64 v[33:34], v[45:46], 24
; SI-NEXT: v_lshr_b64 v[38:39], v[45:46], 8
@@ -45408,33 +45380,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v13, s98
+; SI-NEXT: v_mov_b32_e32 v27, s62
; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v13, s46
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v13, s56
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v13, s58
-; SI-NEXT: v_mov_b32_e32 v27, s62
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v13, s46
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v27, s72
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v13, s56
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v27, s74
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v13, s58
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v27, s76
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v57, s16
@@ -45468,6 +45440,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: v_mov_b32_e32 v3, s6
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_readlane_b32 s5, v61, 1
+; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v13, s60
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v27, s78
@@ -45809,17 +45782,16 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xff, v15
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
; SI-NEXT: v_or_b32_e32 v13, v14, v13
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17
; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: v_or_b32_e32 v13, v13, v14
@@ -46687,6 +46659,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: s_branch .LBB37_2
; VI-NEXT: .LBB37_4:
+; VI-NEXT: v_mov_b32_e32 v53, s46
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s56
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_readlane_b32 s4, v62, 0
; VI-NEXT: v_mov_b32_e32 v48, s4
@@ -46764,6 +46740,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v39, s4
; VI-NEXT: v_readlane_b32 s4, v62, 26
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s58
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v39, s4
; VI-NEXT: v_readlane_b32 s4, v62, 27
@@ -46841,6 +46820,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: v_readlane_b32 s4, v62, 51
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v39, s4
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s60
; VI-NEXT: v_readlane_b32 s4, v62, 52
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v39, s4
@@ -46859,40 +46841,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: v_readlane_b32 s4, v62, 57
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v39, s4
-; VI-NEXT: v_mov_b32_e32 v53, s46
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s56
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s58
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s60
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s62
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s72
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s74
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s76
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s78
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s88
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v53, s90
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v31, s16
; VI-NEXT: v_mov_b32_e32 v32, s17
; VI-NEXT: v_mov_b32_e32 v29, s18
@@ -46946,11 +46894,35 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; VI-NEXT: v_mov_b32_e32 v42, s82
; VI-NEXT: v_mov_b32_e32 v37, s81
; VI-NEXT: v_mov_b32_e32 v50, s80
-; VI-NEXT: v_mov_b32_e32 v53, s30
-; VI-NEXT: v_mov_b32_e32 v54, s34
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v39, s36
; VI-NEXT: v_mov_b32_e32 v40, s38
; VI-NEXT: v_mov_b32_e32 v41, s48
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s62
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s72
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s74
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s76
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s78
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s88
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s90
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v53, s30
+; VI-NEXT: v_mov_b32_e32 v54, s34
; VI-NEXT: .LBB37_5: ; %end
; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v34
; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v35
@@ -48123,10 +48095,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX9-NEXT: v_readlane_b32 s4, v62, 49
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v40, s4
-; GFX9-NEXT: v_mov_b32_e32 v49, s52
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v40, s46
-; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -48175,6 +48145,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v40, s94
+; GFX9-NEXT: v_mov_b32_e32 v49, s52
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
@@ -48222,6 +48193,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v54, s55
; GFX9-NEXT: v_mov_b32_e32 v50, s53
; GFX9-NEXT: v_mov_b32_e32 v60, s54
+; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v49, s51
; GFX9-NEXT: v_mov_b32_e32 v59, s50
; GFX9-NEXT: v_mov_b32_e32 v58, s49
@@ -48646,7 +48618,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
@@ -48681,7 +48653,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX11-NEXT: v_writelane_b32 v76, s101, 5
; GFX11-NEXT: s_mov_b32 vcc_hi, 0
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64
@@ -49601,7 +49573,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80
; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v74, off, s32
; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8
@@ -49663,7 +49635,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; GFX11-NEXT: v_readlane_b32 s31, v75, 1
; GFX11-NEXT: v_readlane_b32 s30, v75, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76
; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80
; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84
@@ -49876,13 +49848,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196
@@ -50043,44 +50028,30 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB38_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
@@ -50089,11 +50060,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
@@ -50696,7 +50667,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB38_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
@@ -50710,8 +50680,8 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
@@ -51391,13 +51361,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -51534,34 +51516,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -52047,7 +52015,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_add_u16_e32 v9, 3, v61
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -52625,13 +52592,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -52773,34 +52754,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -53287,7 +53254,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v61
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -54426,7 +54392,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584
@@ -54459,7 +54425,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456
@@ -55400,7 +55366,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36
; GFX11-FAKE16-NEXT: .LBB38_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400
@@ -55433,7 +55399,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528
@@ -56150,24 +56116,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch .LBB39_3
; SI-NEXT: .LBB39_2:
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v55, v56
; SI-NEXT: v_mov_b32_e32 v42, v46
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
@@ -56178,10 +56133,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: .LBB39_3: ; %Flow
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_mov_b32_e32 v35, v57
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
@@ -56191,7 +56158,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -56786,13 +56752,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5
; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7
; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@@ -57020,11 +56986,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
; VI-NEXT: s_or_b32 s4, s4, s5
@@ -57034,11 +57000,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_lshl_b32 s7, s23, 8
; VI-NEXT: s_lshl_b32 s8, s27, 8
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
@@ -57046,6 +57009,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -57254,12 +57219,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_branch .LBB39_3
; VI-NEXT: .LBB39_2:
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v44, v56
; VI-NEXT: v_mov_b32_e32 v41, v33
; VI-NEXT: v_mov_b32_e32 v50, v40
@@ -57277,6 +57236,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v54, v53
; VI-NEXT: v_mov_b32_e32 v52, v36
; VI-NEXT: v_mov_b32_e32 v49, v51
@@ -57286,7 +57251,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_mov_b32_e32 v51, v41
; VI-NEXT: v_mov_b32_e32 v36, v44
; VI-NEXT: v_mov_b32_e32 v53, v54
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v54, v60
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -57299,7 +57264,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: ; %bb.4: ; %cmp.true
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
@@ -57884,8 +57848,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9
; GFX9-NEXT: s_waitcnt vmcnt(5)
@@ -58064,16 +58028,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_lshl_b32 s6, s19, 8
; GFX9-NEXT: s_lshl_b32 s7, s23, 8
; GFX9-NEXT: s_lshl_b32 s8, s27, 8
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -58100,9 +58066,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -58118,14 +58083,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
@@ -58137,10 +58104,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_mov_b32_e32 v61, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -58153,10 +58121,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_mov_b32_e32 v37, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
@@ -58170,17 +58140,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -58196,45 +58171,24 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(11)
-; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(10)
; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_mov_b32_e32 v40, v30
; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -58285,18 +58239,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_branch .LBB39_3
; GFX9-NEXT: .LBB39_2:
-; GFX9-NEXT: v_mov_b32_e32 v38, v51
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v33, v43
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
@@ -58310,6 +58252,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v38, v51
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: v_mov_b32_e32 v36, v31
; GFX9-NEXT: v_mov_b32_e32 v40, v30
@@ -58747,7 +58701,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -58780,7 +58734,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -59637,7 +59591,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-TRUE16-NEXT: .LBB39_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -59670,7 +59624,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -59688,7 +59642,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -59721,7 +59675,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -60578,7 +60532,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-FAKE16-NEXT: .LBB39_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -60611,7 +60565,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -64239,7 +64193,10 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -64256,9 +64213,6 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -64824,7 +64778,7 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -67291,7 +67245,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
@@ -67324,7 +67278,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
@@ -67357,7 +67311,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
@@ -68159,7 +68113,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
@@ -68192,7 +68146,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
@@ -68225,7 +68179,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
@@ -68265,7 +68219,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
@@ -68298,7 +68252,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
@@ -68331,7 +68285,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
@@ -69023,7 +68977,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
@@ -69056,7 +69010,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
@@ -69089,7 +69043,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
@@ -72813,7 +72767,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -72846,7 +72800,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -72879,7 +72833,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -72957,7 +72911,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -72990,7 +72944,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -73023,7 +72977,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a,
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -73081,6 +73035,10 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
; SI-LABEL: bitcast_v32f32_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -73097,10 +73055,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr63
@@ -73132,14 +73086,13 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; kill: killed $vgpr39
; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB48_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16
@@ -73184,7 +73137,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
; SI-NEXT: s_cbranch_execz .LBB48_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_f32_e32 v31, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v32, 1.0, v32
; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-NEXT: v_add_f32_e32 v1, 1.0, v1
@@ -73437,7 +73389,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -74373,7 +74325,13 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
@@ -74405,12 +74363,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -74426,7 +74378,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
@@ -74643,7 +74594,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
@@ -75817,7 +75767,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -75850,7 +75800,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -75883,7 +75833,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -75961,7 +75911,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -75994,7 +75944,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -76027,7 +75977,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -77054,22 +77004,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; SI-LABEL: bitcast_v16i64_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
@@ -77202,6 +77136,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr41
@@ -77233,14 +77183,13 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB56_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -77501,7 +77450,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
@@ -78266,22 +78214,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; VI-LABEL: bitcast_v16i64_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -78386,6 +78318,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -78491,129 +78439,129 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; VI-NEXT: v_mov_b32_e32 v55, v39
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
-; VI-NEXT: v_mov_b32_e32 v55, v39
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27
; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10
; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9
; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9
@@ -79235,6 +79183,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-LABEL: bitcast_v16i64_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -79251,9 +79203,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -79386,7 +79335,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -79442,7 +79390,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: s_waitcnt vmcnt(45)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -79457,7 +79405,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
+; GFX9-NEXT: s_waitcnt vmcnt(47)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -79469,149 +79417,147 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15
@@ -79619,7 +79565,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12
; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8
@@ -79676,7 +79624,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v28, vcc
; GFX9-NEXT: v_add_co_u32_e32 v29, vcc, 3, v29
; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, 0, v30, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: s_waitcnt vmcnt(44)
; GFX9-NEXT: v_add_co_u32_e32 v31, vcc, 3, v31
; GFX9-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v32, vcc
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
@@ -80712,7 +80660,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -80733,10 +80685,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -81381,7 +81329,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -84631,7 +84579,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v16, s32
; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4
; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8
@@ -85566,7 +85514,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: v_readlane_b32 s35, v16, 3
; GFX11-NEXT: v_readlane_b32 s34, v16, 2
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v16, off, s32
; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8
@@ -85779,13 +85727,26 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196
@@ -85946,44 +85907,30 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB58_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
@@ -85992,11 +85939,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
@@ -86599,7 +86546,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB58_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
@@ -86613,8 +86559,8 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
@@ -87294,13 +87240,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -87437,34 +87395,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -87950,7 +87894,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_add_u16_e32 v9, 3, v61
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -88528,13 +88471,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -88676,34 +88633,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -89190,7 +89133,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v61
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -90329,7 +90271,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584
@@ -90362,7 +90304,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456
@@ -91303,7 +91245,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36
; GFX11-FAKE16-NEXT: .LBB58_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400
@@ -91336,7 +91278,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528
@@ -92053,24 +91995,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch .LBB59_3
; SI-NEXT: .LBB59_2:
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v55, v56
; SI-NEXT: v_mov_b32_e32 v42, v46
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
@@ -92081,10 +92012,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: .LBB59_3: ; %Flow
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_mov_b32_e32 v35, v57
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
@@ -92094,7 +92037,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -92689,13 +92631,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5
; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7
; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@@ -92923,11 +92865,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
; VI-NEXT: s_or_b32 s4, s4, s5
@@ -92937,11 +92879,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_lshl_b32 s7, s23, 8
; VI-NEXT: s_lshl_b32 s8, s27, 8
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
@@ -92949,6 +92888,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -93157,12 +93098,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_branch .LBB59_3
; VI-NEXT: .LBB59_2:
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v44, v56
; VI-NEXT: v_mov_b32_e32 v41, v33
; VI-NEXT: v_mov_b32_e32 v50, v40
@@ -93180,6 +93115,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v54, v53
; VI-NEXT: v_mov_b32_e32 v52, v36
; VI-NEXT: v_mov_b32_e32 v49, v51
@@ -93189,7 +93130,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v51, v41
; VI-NEXT: v_mov_b32_e32 v36, v44
; VI-NEXT: v_mov_b32_e32 v53, v54
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v54, v60
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -93202,7 +93143,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: ; %bb.4: ; %cmp.true
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
@@ -93787,8 +93727,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9
; GFX9-NEXT: s_waitcnt vmcnt(5)
@@ -93967,16 +93907,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_lshl_b32 s6, s19, 8
; GFX9-NEXT: s_lshl_b32 s7, s23, 8
; GFX9-NEXT: s_lshl_b32 s8, s27, 8
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -94003,9 +93945,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -94021,14 +93962,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
@@ -94040,10 +93983,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_mov_b32_e32 v61, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -94056,10 +94000,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_mov_b32_e32 v37, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
@@ -94073,17 +94019,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -94099,45 +94050,24 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(11)
-; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(10)
; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_mov_b32_e32 v40, v30
; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -94188,18 +94118,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_branch .LBB59_3
; GFX9-NEXT: .LBB59_2:
-; GFX9-NEXT: v_mov_b32_e32 v38, v51
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v33, v43
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
@@ -94213,6 +94131,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v38, v51
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: v_mov_b32_e32 v36, v31
; GFX9-NEXT: v_mov_b32_e32 v40, v30
@@ -94650,7 +94580,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -94683,7 +94613,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -95540,7 +95470,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-TRUE16-NEXT: .LBB59_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -95573,7 +95503,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -95591,7 +95521,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -95624,7 +95554,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -96481,7 +96411,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-FAKE16-NEXT: .LBB59_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -96514,7 +96444,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -100084,7 +100014,10 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -100101,9 +100034,6 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -100669,7 +100599,7 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -103136,7 +103066,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
@@ -103169,7 +103099,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
@@ -103202,7 +103132,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
@@ -104004,7 +103934,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
@@ -104037,7 +103967,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
@@ -104070,7 +104000,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
@@ -104110,7 +104040,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
@@ -104143,7 +104073,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
@@ -104176,7 +104106,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
@@ -104868,7 +104798,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
@@ -104901,7 +104831,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
@@ -104934,7 +104864,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
@@ -108700,7 +108630,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -108733,7 +108663,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -108766,7 +108696,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -108844,7 +108774,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -108877,7 +108807,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -108910,7 +108840,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -108968,6 +108898,10 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
; SI-LABEL: bitcast_v16i64_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -108984,10 +108918,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr63
@@ -109019,14 +108949,13 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; kill: killed $vgpr48
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB68_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16
@@ -109099,7 +109028,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc
; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16
@@ -109322,7 +109250,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -110320,7 +110248,13 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
@@ -110352,12 +110286,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -110373,7 +110301,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
@@ -110590,7 +110517,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
@@ -111764,7 +111690,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -111797,7 +111723,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -111830,7 +111756,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -111908,7 +111834,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -111941,7 +111867,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -111974,7 +111900,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -112032,22 +111958,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; SI-LABEL: bitcast_v16f64_to_v128i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -112180,6 +112090,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr42
@@ -112211,14 +112137,13 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; kill: killed $vgpr36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB72_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -112449,7 +112374,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB72_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0
; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24
@@ -113228,22 +113152,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; VI-LABEL: bitcast_v16f64_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -113346,6 +113254,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr57
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -113448,132 +113372,132 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32]
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v11
; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10
; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10
@@ -114184,6 +114108,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-LABEL: bitcast_v16f64_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -114200,9 +114128,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; kill: killed $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr41
@@ -114335,7 +114260,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr48
; GFX9-NEXT: ; kill: killed $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr41
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
@@ -114395,7 +114319,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
+; GFX9-NEXT: s_waitcnt vmcnt(47)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -114408,7 +114332,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(33)
+; GFX9-NEXT: s_waitcnt vmcnt(49)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -114416,152 +114340,151 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6]
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
@@ -114571,6 +114494,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v12
; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9
@@ -114599,7 +114523,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB72_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
-; GFX9-NEXT: s_waitcnt vmcnt(30)
+; GFX9-NEXT: s_waitcnt vmcnt(46)
; GFX9-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0
; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
@@ -115628,7 +115552,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -115649,10 +115577,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -116272,7 +116196,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -117056,6 +116980,11 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: v_mov_b32_e32 v33, s4
; SI-NEXT: v_readlane_b32 s4, v61, 39
; SI-NEXT: v_mov_b32_e32 v30, s4
+; SI-NEXT: v_mov_b32_e32 v29, s46
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v29, s98
; SI-NEXT: v_readlane_b32 s4, v61, 40
; SI-NEXT: v_mov_b32_e32 v34, s4
; SI-NEXT: v_readlane_b32 s4, v61, 41
@@ -117148,6 +117077,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v25, s4
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v29, s96
; SI-NEXT: v_readlane_b32 s4, v62, 0
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -117204,20 +117137,69 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v25, s4
-; SI-NEXT: v_mov_b32_e32 v29, s46
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v29, s98
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v29, s96
+; SI-NEXT: v_readlane_b32 s4, v62, 14
+; SI-NEXT: v_mov_b32_e32 v60, s4
+; SI-NEXT: v_readlane_b32 s4, v62, 15
+; SI-NEXT: v_mov_b32_e32 v31, s4
+; SI-NEXT: v_readlane_b32 s4, v62, 16
+; SI-NEXT: v_mov_b32_e32 v32, s4
+; SI-NEXT: v_readlane_b32 s4, v62, 17
+; SI-NEXT: v_mov_b32_e32 v18, s5
+; SI-NEXT: v_mov_b32_e32 v46, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 0
+; SI-NEXT: v_readlane_b32 s5, v61, 1
+; SI-NEXT: v_mov_b32_e32 v59, s17
+; SI-NEXT: v_mov_b32_e32 v58, s16
+; SI-NEXT: v_mov_b32_e32 v45, s19
+; SI-NEXT: v_mov_b32_e32 v44, s18
+; SI-NEXT: v_mov_b32_e32 v53, s21
+; SI-NEXT: v_mov_b32_e32 v52, s20
+; SI-NEXT: v_mov_b32_e32 v39, s23
+; SI-NEXT: v_mov_b32_e32 v38, s22
+; SI-NEXT: v_mov_b32_e32 v24, s25
+; SI-NEXT: v_mov_b32_e32 v23, s24
+; SI-NEXT: v_mov_b32_e32 v22, s27
+; SI-NEXT: v_mov_b32_e32 v21, s26
+; SI-NEXT: v_mov_b32_e32 v20, s29
+; SI-NEXT: v_mov_b32_e32 v19, s28
+; SI-NEXT: v_mov_b32_e32 v16, s7
+; SI-NEXT: v_mov_b32_e32 v15, s6
+; SI-NEXT: v_mov_b32_e32 v14, s9
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s86
+; SI-NEXT: v_mov_b32_e32 v13, s8
+; SI-NEXT: v_mov_b32_e32 v12, s11
+; SI-NEXT: v_mov_b32_e32 v11, s10
+; SI-NEXT: v_mov_b32_e32 v10, s13
+; SI-NEXT: v_mov_b32_e32 v9, s12
+; SI-NEXT: v_mov_b32_e32 v8, s15
+; SI-NEXT: v_mov_b32_e32 v7, s14
+; SI-NEXT: v_mov_b32_e32 v6, s41
+; SI-NEXT: v_mov_b32_e32 v5, s40
+; SI-NEXT: v_mov_b32_e32 v4, s43
+; SI-NEXT: v_mov_b32_e32 v3, s42
+; SI-NEXT: v_mov_b32_e32 v2, s45
+; SI-NEXT: v_mov_b32_e32 v1, s44
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v28, s38
+; SI-NEXT: v_mov_b32_e32 v27, s36
+; SI-NEXT: v_mov_b32_e32 v26, s34
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v25, s30
+; SI-NEXT: v_mov_b32_e32 v56, s94
+; SI-NEXT: v_mov_b32_e32 v55, s92
+; SI-NEXT: v_mov_b32_e32 v54, s90
+; SI-NEXT: v_mov_b32_e32 v42, s88
+; SI-NEXT: v_mov_b32_e32 v41, s78
+; SI-NEXT: v_mov_b32_e32 v40, s76
+; SI-NEXT: v_mov_b32_e32 v50, s74
+; SI-NEXT: v_mov_b32_e32 v49, s72
+; SI-NEXT: v_mov_b32_e32 v48, s62
+; SI-NEXT: v_mov_b32_e32 v47, s60
+; SI-NEXT: v_mov_b32_e32 v36, s58
+; SI-NEXT: v_mov_b32_e32 v35, s56
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
@@ -117260,165 +117242,108 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: v_mov_b32_e32 v29, s50
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s4, v62, 14
-; SI-NEXT: v_mov_b32_e32 v60, s4
-; SI-NEXT: v_readlane_b32 s4, v62, 15
-; SI-NEXT: v_mov_b32_e32 v31, s4
-; SI-NEXT: v_readlane_b32 s4, v62, 16
-; SI-NEXT: v_mov_b32_e32 v32, s4
-; SI-NEXT: v_readlane_b32 s4, v62, 17
-; SI-NEXT: v_mov_b32_e32 v18, s5
-; SI-NEXT: v_mov_b32_e32 v46, s4
-; SI-NEXT: v_readlane_b32 s4, v61, 0
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 2
+; SI-NEXT: v_readlane_b32 s5, v61, 3
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 1
-; SI-NEXT: v_readlane_b32 s4, v61, 2
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 4
+; SI-NEXT: v_readlane_b32 s5, v61, 5
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 3
-; SI-NEXT: v_readlane_b32 s4, v61, 4
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 6
+; SI-NEXT: v_readlane_b32 s5, v61, 7
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 5
-; SI-NEXT: v_readlane_b32 s4, v61, 6
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 8
+; SI-NEXT: v_readlane_b32 s5, v61, 9
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 7
-; SI-NEXT: v_readlane_b32 s4, v61, 8
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 10
+; SI-NEXT: v_readlane_b32 s5, v61, 11
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 9
-; SI-NEXT: v_readlane_b32 s4, v61, 10
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 12
+; SI-NEXT: v_readlane_b32 s5, v61, 13
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 11
-; SI-NEXT: v_readlane_b32 s4, v61, 12
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 14
+; SI-NEXT: v_readlane_b32 s5, v61, 15
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 13
-; SI-NEXT: v_readlane_b32 s4, v61, 14
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 16
+; SI-NEXT: v_readlane_b32 s5, v61, 17
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 15
-; SI-NEXT: v_readlane_b32 s4, v61, 16
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 18
+; SI-NEXT: v_readlane_b32 s5, v61, 19
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 17
-; SI-NEXT: v_readlane_b32 s4, v61, 18
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 20
+; SI-NEXT: v_readlane_b32 s5, v61, 21
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 19
-; SI-NEXT: v_readlane_b32 s4, v61, 20
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 22
+; SI-NEXT: v_readlane_b32 s5, v61, 23
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 21
-; SI-NEXT: v_readlane_b32 s4, v61, 22
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 24
+; SI-NEXT: v_readlane_b32 s5, v61, 25
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 23
-; SI-NEXT: v_readlane_b32 s4, v61, 24
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 26
+; SI-NEXT: v_readlane_b32 s5, v61, 27
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 25
-; SI-NEXT: v_readlane_b32 s4, v61, 26
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 28
+; SI-NEXT: v_readlane_b32 s5, v61, 29
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 27
-; SI-NEXT: v_readlane_b32 s4, v61, 28
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 30
+; SI-NEXT: v_readlane_b32 s5, v61, 31
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 29
-; SI-NEXT: v_readlane_b32 s4, v61, 30
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
+; SI-NEXT: v_readlane_b32 s4, v61, 32
+; SI-NEXT: v_readlane_b32 s5, v61, 33
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s48
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: v_readlane_b32 s5, v61, 31
-; SI-NEXT: v_readlane_b32 s4, v61, 32
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v29, s4
-; SI-NEXT: v_mov_b32_e32 v59, s17
-; SI-NEXT: v_mov_b32_e32 v58, s16
-; SI-NEXT: v_mov_b32_e32 v45, s19
-; SI-NEXT: v_mov_b32_e32 v44, s18
-; SI-NEXT: v_mov_b32_e32 v53, s21
-; SI-NEXT: v_mov_b32_e32 v52, s20
-; SI-NEXT: v_mov_b32_e32 v39, s23
-; SI-NEXT: v_mov_b32_e32 v38, s22
-; SI-NEXT: v_mov_b32_e32 v24, s25
-; SI-NEXT: v_mov_b32_e32 v23, s24
-; SI-NEXT: v_mov_b32_e32 v22, s27
-; SI-NEXT: v_mov_b32_e32 v21, s26
-; SI-NEXT: v_mov_b32_e32 v20, s29
-; SI-NEXT: v_mov_b32_e32 v19, s28
-; SI-NEXT: v_mov_b32_e32 v16, s7
-; SI-NEXT: v_mov_b32_e32 v15, s6
-; SI-NEXT: v_mov_b32_e32 v14, s9
-; SI-NEXT: v_mov_b32_e32 v13, s8
-; SI-NEXT: v_mov_b32_e32 v12, s11
-; SI-NEXT: v_mov_b32_e32 v11, s10
-; SI-NEXT: v_mov_b32_e32 v10, s13
-; SI-NEXT: v_mov_b32_e32 v9, s12
-; SI-NEXT: v_mov_b32_e32 v8, s15
-; SI-NEXT: v_mov_b32_e32 v7, s14
-; SI-NEXT: v_mov_b32_e32 v6, s41
-; SI-NEXT: v_mov_b32_e32 v5, s40
-; SI-NEXT: v_mov_b32_e32 v4, s43
-; SI-NEXT: v_mov_b32_e32 v3, s42
-; SI-NEXT: v_mov_b32_e32 v2, s45
-; SI-NEXT: v_mov_b32_e32 v1, s44
-; SI-NEXT: v_mov_b32_e32 v28, s38
-; SI-NEXT: v_mov_b32_e32 v27, s36
-; SI-NEXT: v_mov_b32_e32 v26, s34
-; SI-NEXT: v_mov_b32_e32 v25, s30
-; SI-NEXT: v_mov_b32_e32 v56, s94
-; SI-NEXT: v_mov_b32_e32 v55, s92
-; SI-NEXT: v_mov_b32_e32 v54, s90
-; SI-NEXT: v_mov_b32_e32 v42, s88
-; SI-NEXT: v_mov_b32_e32 v41, s78
-; SI-NEXT: v_mov_b32_e32 v40, s76
-; SI-NEXT: v_mov_b32_e32 v50, s74
-; SI-NEXT: v_mov_b32_e32 v49, s72
-; SI-NEXT: v_mov_b32_e32 v48, s62
-; SI-NEXT: v_mov_b32_e32 v47, s60
-; SI-NEXT: v_mov_b32_e32 v36, s58
-; SI-NEXT: v_mov_b32_e32 v35, s56
-; SI-NEXT: v_readlane_b32 s5, v61, 33
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: .LBB73_5: ; %end
@@ -118690,6 +118615,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_readlane_b32 s4, v62, 11
; VI-NEXT: v_mov_b32_e32 v41, s4
+; VI-NEXT: v_mov_b32_e32 v40, s48
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s38
; VI-NEXT: v_readlane_b32 s4, v62, 12
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v35, s4
@@ -118727,6 +118656,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_readlane_b32 s4, v62, 25
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s36
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_readlane_b32 s4, v62, 26
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
@@ -118764,6 +118696,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_readlane_b32 s4, v62, 37
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v35, s4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s34
; VI-NEXT: v_readlane_b32 s4, v62, 38
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v35, s4
@@ -118779,52 +118714,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_readlane_b32 s4, v62, 42
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v35, s4
-; VI-NEXT: v_mov_b32_e32 v40, s48
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s38
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s36
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s34
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s30
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s90
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s88
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s78
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s76
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s74
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s72
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s62
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s60
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s58
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v40, s56
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: v_readlane_b32 s4, v62, 43
; VI-NEXT: v_mov_b32_e32 v53, s4
; VI-NEXT: v_readlane_b32 s4, v62, 44
@@ -118834,6 +118723,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_readlane_b32 s4, v62, 46
; VI-NEXT: v_mov_b32_e32 v58, s4
; VI-NEXT: v_readlane_b32 s4, v62, 47
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_readlane_b32 s4, v62, 48
; VI-NEXT: v_mov_b32_e32 v54, s4
@@ -118846,17 +118736,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_readlane_b32 s4, v62, 52
; VI-NEXT: v_mov_b32_e32 v39, s4
; VI-NEXT: v_readlane_b32 s4, v62, 53
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s30
; VI-NEXT: v_mov_b32_e32 v49, s4
; VI-NEXT: v_readlane_b32 s4, v62, 54
; VI-NEXT: v_mov_b32_e32 v61, s4
; VI-NEXT: v_readlane_b32 s4, v62, 55
; VI-NEXT: v_mov_b32_e32 v36, s4
; VI-NEXT: v_readlane_b32 s4, v62, 56
-; VI-NEXT: v_mov_b32_e32 v40, s46
; VI-NEXT: v_mov_b32_e32 v55, s4
; VI-NEXT: v_readlane_b32 s4, v62, 57
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v12, s5
; VI-NEXT: v_mov_b32_e32 v1, s44
; VI-NEXT: v_mov_b32_e32 v2, s45
@@ -118886,13 +118776,48 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_mov_b32_e32 v28, s21
; VI-NEXT: v_mov_b32_e32 v29, s18
; VI-NEXT: v_mov_b32_e32 v30, s19
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s90
; VI-NEXT: v_mov_b32_e32 v31, s16
; VI-NEXT: v_mov_b32_e32 v32, s17
; VI-NEXT: v_mov_b32_e32 v42, s70
; VI-NEXT: v_mov_b32_e32 v50, s4
-; VI-NEXT: v_mov_b32_e32 v40, v43
; VI-NEXT: v_mov_b32_e32 v46, v38
; VI-NEXT: v_mov_b32_e32 v38, v34
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s88
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s78
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s76
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s74
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s72
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s62
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s60
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s58
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s56
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, s46
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, v43
; VI-NEXT: .LBB73_5: ; %end
; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42
; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -119906,6 +119831,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $sgpr46
; GFX9-NEXT: s_branch .LBB73_2
; GFX9-NEXT: .LBB73_4:
+; GFX9-NEXT: v_mov_b32_e32 v41, s66
+; GFX9-NEXT: v_mov_b32_e32 v40, s36
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s34
; GFX9-NEXT: v_mov_b32_e32 v15, s81
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v15, s71
@@ -119982,6 +119913,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v15, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 9
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s30
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v15, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 10
@@ -120040,71 +119975,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_readlane_b32 s4, v62, 28
; GFX9-NEXT: v_mov_b32_e32 v29, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 29
-; GFX9-NEXT: v_mov_b32_e32 v41, s66
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v15, s4
-; GFX9-NEXT: v_mov_b32_e32 v40, s36
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s34
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s30
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s94
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s92
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s90
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s88
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s78
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s76
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s74
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s72
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s62
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s60
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s58
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v40, s56
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: v_readlane_b32 s4, v62, 30
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v15, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 31
; GFX9-NEXT: v_mov_b32_e32 v44, s4
@@ -120119,6 +119993,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_readlane_b32 s4, v62, 36
; GFX9-NEXT: v_mov_b32_e32 v55, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 37
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s94
; GFX9-NEXT: v_mov_b32_e32 v61, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 38
; GFX9-NEXT: v_mov_b32_e32 v42, s4
@@ -120143,7 +120021,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_readlane_b32 s4, v62, 48
; GFX9-NEXT: v_mov_b32_e32 v60, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 49
-; GFX9-NEXT: v_mov_b32_e32 v40, s46
; GFX9-NEXT: v_mov_b32_e32 v12, s5
; GFX9-NEXT: v_mov_b32_e32 v1, s44
; GFX9-NEXT: v_mov_b32_e32 v2, s45
@@ -120181,6 +120058,54 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v54, s64
; GFX9-NEXT: v_mov_b32_e32 v52, s54
; GFX9-NEXT: v_mov_b32_e32 v25, s4
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s92
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s90
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s88
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s78
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s76
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s74
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s72
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s62
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s60
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s58
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s56
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v40, s46
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -120202,6 +120127,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v25, v51, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v45
; GFX9-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v56
; GFX9-NEXT: v_or_b32_sdwa v50, v50, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22
@@ -120252,22 +120179,20 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX9-NEXT: v_readlane_b32 s31, v63, 1
; GFX9-NEXT: v_readlane_b32 s30, v63, 0
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36
; GFX9-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v44
; GFX9-NEXT: v_or_b32_sdwa v28, v28, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29
; GFX9-NEXT: v_or_b32_sdwa v29, v19, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v30
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v20, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v51
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36
; GFX9-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -120599,7 +120524,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88
@@ -120634,7 +120559,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX11-NEXT: v_writelane_b32 v77, s101, 5
; GFX11-NEXT: s_mov_b32 vcc_hi, 0
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: s_clause 0x13
+; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
@@ -121542,7 +121467,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80
; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: s_clause 0x13
+; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v75, off, s32
; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:8
@@ -121605,7 +121530,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; GFX11-NEXT: v_readlane_b32 s31, v76, 1
; GFX11-NEXT: v_readlane_b32 s30, v76, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80
; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84
; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88
@@ -121818,13 +121743,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196
@@ -121985,44 +121923,30 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB74_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
@@ -122031,11 +121955,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
@@ -122638,7 +122562,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB74_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
@@ -122652,8 +122575,8 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
@@ -123333,13 +123256,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -123476,34 +123411,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -123989,7 +123910,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_add_u16_e32 v9, 3, v61
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -124567,13 +124487,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -124715,34 +124649,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -125229,7 +125149,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v61
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
@@ -126368,7 +126287,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584
@@ -126401,7 +126320,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456
@@ -127342,7 +127261,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36
; GFX11-FAKE16-NEXT: .LBB74_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400
@@ -127375,7 +127294,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0x12
+; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528
@@ -128092,24 +128011,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch .LBB75_3
; SI-NEXT: .LBB75_2:
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v55, v56
; SI-NEXT: v_mov_b32_e32 v42, v46
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
@@ -128120,10 +128028,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: .LBB75_3: ; %Flow
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_mov_b32_e32 v35, v57
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
@@ -128133,7 +128053,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -128728,13 +128647,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5
; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7
; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@@ -128962,11 +128881,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
; VI-NEXT: s_or_b32 s4, s4, s5
@@ -128976,11 +128895,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_lshl_b32 s7, s23, 8
; VI-NEXT: s_lshl_b32 s8, s27, 8
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
@@ -128988,6 +128904,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -129196,12 +129114,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_branch .LBB75_3
; VI-NEXT: .LBB75_2:
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v44, v56
; VI-NEXT: v_mov_b32_e32 v41, v33
; VI-NEXT: v_mov_b32_e32 v50, v40
@@ -129219,6 +129131,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v54, v53
; VI-NEXT: v_mov_b32_e32 v52, v36
; VI-NEXT: v_mov_b32_e32 v49, v51
@@ -129228,7 +129146,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_mov_b32_e32 v51, v41
; VI-NEXT: v_mov_b32_e32 v36, v44
; VI-NEXT: v_mov_b32_e32 v53, v54
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v54, v60
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -129241,7 +129159,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: ; %bb.4: ; %cmp.true
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
@@ -129826,8 +129743,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9
; GFX9-NEXT: s_waitcnt vmcnt(5)
@@ -130006,16 +129923,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_lshl_b32 s6, s19, 8
; GFX9-NEXT: s_lshl_b32 s7, s23, 8
; GFX9-NEXT: s_lshl_b32 s8, s27, 8
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -130042,9 +129961,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -130060,14 +129978,16 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
@@ -130079,10 +129999,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_mov_b32_e32 v61, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -130095,10 +130016,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_mov_b32_e32 v37, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
@@ -130112,17 +130035,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -130138,45 +130066,24 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(11)
-; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(10)
; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_mov_b32_e32 v40, v30
; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -130227,18 +130134,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_branch .LBB75_3
; GFX9-NEXT: .LBB75_2:
-; GFX9-NEXT: v_mov_b32_e32 v38, v51
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v33, v43
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
@@ -130252,6 +130147,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v38, v51
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v35, v62
; GFX9-NEXT: v_mov_b32_e32 v36, v31
; GFX9-NEXT: v_mov_b32_e32 v40, v30
@@ -130689,7 +130596,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -130722,7 +130629,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -131579,7 +131486,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-TRUE16-NEXT: .LBB75_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -131612,7 +131519,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -131630,7 +131537,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -131663,7 +131570,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -132520,7 +132427,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-FAKE16-NEXT: .LBB75_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -132553,7 +132460,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -132588,22 +132495,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-LABEL: bitcast_v16f64_to_v64bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -132672,6 +132563,22 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; kill: killed $vgpr35
; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr63
@@ -132703,7 +132610,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; kill: killed $vgpr35
; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -132713,7 +132620,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
@@ -132843,7 +132750,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB76_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0
; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32
@@ -136071,7 +135977,10 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -136088,9 +135997,6 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -136656,7 +136562,7 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -139123,7 +139029,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
@@ -139156,7 +139062,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
@@ -139189,7 +139095,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
@@ -139991,7 +139897,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
@@ -140024,7 +139930,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
@@ -140057,7 +139963,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
@@ -140097,7 +140003,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
@@ -140130,7 +140036,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
@@ -140163,7 +140069,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
@@ -140855,7 +140761,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
@@ -140888,7 +140794,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
@@ -140921,7 +140827,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
@@ -140978,22 +140884,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
; SI-LABEL: bitcast_v16f64_to_v64f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -141062,6 +140952,22 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; kill: killed $vgpr35
; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr47
@@ -141093,7 +140999,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr35
; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -141144,7 +141050,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v52
; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31
; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28
; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22
@@ -141314,7 +141219,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0
; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8
; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9
@@ -144567,7 +144471,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -144600,7 +144504,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -144633,7 +144537,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -144711,7 +144615,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -144744,7 +144648,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -144777,7 +144681,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -144835,6 +144739,10 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
; SI-LABEL: bitcast_v16f64_to_v64i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -144851,10 +144759,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr63
@@ -144886,14 +144790,13 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; kill: killed $vgpr48
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB84_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16
@@ -144937,7 +144840,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB84_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0
; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
@@ -145175,7 +145077,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -146031,7 +145933,13 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
@@ -146063,12 +145971,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -146084,7 +145986,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
@@ -146301,7 +146202,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
@@ -147475,7 +147375,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -147508,7 +147408,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -147541,7 +147441,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -147619,7 +147519,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v185, off, s32
; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -147652,7 +147552,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -147685,7 +147585,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -147895,6 +147795,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264
; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr9
@@ -147904,7 +147806,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140
@@ -147944,38 +147846,39 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200
@@ -147991,11 +147894,12 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240
-; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -148017,14 +147921,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316
@@ -148032,11 +147928,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v2
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v3
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340
@@ -148045,9 +147945,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:328
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v2
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372
@@ -148057,7 +147959,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:360
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v1
@@ -149940,8 +149842,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5
; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7
; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9
@@ -150037,13 +149939,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -150171,14 +150085,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -150186,26 +150105,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -150214,35 +150113,57 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr40
; VI-NEXT: ; implicit-def: $vgpr41
; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr49
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -150275,39 +150196,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(6)
-; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr59
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -150473,17 +150374,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr53
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -151168,8 +151061,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5
; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9
@@ -151280,13 +151173,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -151419,14 +151326,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -151434,26 +151346,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -151462,36 +151354,62 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(10)
+; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; implicit-def: $vgpr55
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr38
; GFX9-NEXT: ; implicit-def: $vgpr35
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -151514,49 +151432,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(7)
-; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr60
-; GFX9-NEXT: ; implicit-def: $vgpr56
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr46
-; GFX9-NEXT: ; implicit-def: $vgpr44
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -151722,17 +151616,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr48
-; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6
@@ -153078,7 +152964,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
@@ -153111,7 +152997,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
@@ -153940,7 +153826,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100
; GFX11-FAKE16-NEXT: .LBB88_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400
@@ -153973,7 +153859,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528
@@ -154018,7 +153904,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324
@@ -154029,7 +153914,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304
; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; SI-NEXT: s_mov_b32 s72, s21
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v43, s19, 0
; SI-NEXT: v_writelane_b32 v43, s18, 1
; SI-NEXT: v_writelane_b32 v43, s17, 2
@@ -154070,10 +153955,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v41, s86, 30
; SI-NEXT: v_writelane_b32 v41, s87, 31
; SI-NEXT: v_writelane_b32 v41, s96, 32
+; SI-NEXT: s_mov_b32 s79, s26
; SI-NEXT: v_writelane_b32 v41, s97, 33
; SI-NEXT: v_writelane_b32 v41, s98, 34
; SI-NEXT: v_writelane_b32 v41, s99, 35
-; SI-NEXT: s_mov_b32 s79, s26
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
; SI-NEXT: v_readfirstlane_b32 s38, v20
; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
; SI-NEXT: v_readfirstlane_b32 s39, v19
@@ -154100,9 +153991,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_readfirstlane_b32 s18, v5
; SI-NEXT: v_readfirstlane_b32 s19, v6
; SI-NEXT: v_readfirstlane_b32 s88, v4
-; SI-NEXT: v_readfirstlane_b32 s89, v3
-; SI-NEXT: v_readfirstlane_b32 s90, v9
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s6, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296
@@ -154110,33 +153999,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v32
; SI-NEXT: v_writelane_b32 v43, s4, 4
-; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272
; SI-NEXT: v_writelane_b32 v43, s4, 5
-; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
; SI-NEXT: v_writelane_b32 v43, s4, 6
-; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v35
; SI-NEXT: v_writelane_b32 v43, s4, 7
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v36
; SI-NEXT: v_writelane_b32 v43, s4, 8
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256
; SI-NEXT: v_writelane_b32 v43, s4, 9
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v38
; SI-NEXT: v_writelane_b32 v43, s4, 10
+; SI-NEXT: v_readfirstlane_b32 s89, v3
+; SI-NEXT: v_readfirstlane_b32 s90, v9
; SI-NEXT: v_readfirstlane_b32 s91, v10
; SI-NEXT: v_readfirstlane_b32 s92, v8
; SI-NEXT: v_readfirstlane_b32 s93, v7
@@ -154219,44 +154106,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_readfirstlane_b32 s24, v33
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s78, v34
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v35
; SI-NEXT: v_writelane_b32 v43, s4, 18
+; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v36
; SI-NEXT: v_writelane_b32 v43, s4, 19
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v37
; SI-NEXT: v_writelane_b32 v43, s4, 20
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: v_writelane_b32 v43, s4, 21
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s4, v38
; SI-NEXT: v_writelane_b32 v43, s4, 22
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s4, v39
; SI-NEXT: v_writelane_b32 v43, s4, 23
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_readfirstlane_b32 s4, v48
; SI-NEXT: v_writelane_b32 v43, s4, 24
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_readfirstlane_b32 s4, v49
; SI-NEXT: v_writelane_b32 v43, s4, 25
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s4, v50
; SI-NEXT: v_writelane_b32 v43, s4, 26
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s4, v51
; SI-NEXT: v_writelane_b32 v43, s4, 27
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136
@@ -154270,7 +154154,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104
; SI-NEXT: v_writelane_b32 v43, s4, 28
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_readfirstlane_b32 s4, v52
; SI-NEXT: v_writelane_b32 v43, s4, 29
; SI-NEXT: v_readfirstlane_b32 s4, v53
@@ -154279,7 +154163,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v43, s4, 31
; SI-NEXT: v_readfirstlane_b32 s4, v55
; SI-NEXT: v_writelane_b32 v43, s4, 32
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_readfirstlane_b32 s4, v40
; SI-NEXT: v_writelane_b32 v43, s4, 33
; SI-NEXT: v_writelane_b32 v43, s22, 34
@@ -155894,33 +155777,53 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
+; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
+; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
+; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -155965,52 +155868,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68
; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
-; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
-; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
-; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
-; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
-; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
-; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
-; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
-; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
@@ -156030,6 +155887,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
@@ -156038,7 +155896,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
@@ -156070,6 +155927,25 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB89_2
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
@@ -156094,15 +155970,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v2, v8
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
@@ -156152,10 +156031,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -156163,50 +156043,37 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(9)
-; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v42, v43
; VI-NEXT: v_mov_b32_e32 v43, v37
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
@@ -156221,13 +156088,12 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -156249,21 +156115,28 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v54, v33
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v56, v1
; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v63, v39
+; VI-NEXT: v_mov_b32_e32 v54, v33
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v57, v0
; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -156281,11 +156154,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v53, v35
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, s4, v0
; VI-NEXT: s_and_b32 s4, s16, 0xff
@@ -156318,7 +156190,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: s_branch .LBB89_3
; VI-NEXT: .LBB89_2:
; VI-NEXT: v_mov_b32_e32 v47, v54
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
@@ -156339,6 +156210,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v58, v7
; VI-NEXT: v_mov_b32_e32 v57, v5
; VI-NEXT: v_mov_b32_e32 v56, v3
@@ -156930,29 +156802,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224
; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232
; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
+; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156
+; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188
+; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196
+; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204
+; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212
+; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
+; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260
+; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268
+; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
+; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292
+; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
+; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324
; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: s_waitcnt vmcnt(29)
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
@@ -157016,82 +156910,42 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156
-; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188
-; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196
-; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204
-; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212
-; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
-; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260
-; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268
-; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
-; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292
-; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
-; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(22)
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(24)
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
@@ -157112,6 +156966,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(55)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB89_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_and_b32 s4, s28, 0xff
@@ -157365,14 +157226,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
@@ -157382,7 +157242,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: .LBB89_2:
; GFX9-NEXT: v_mov_b32_e32 v58, v50
; GFX9-NEXT: v_mov_b32_e32 v45, v59
-; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
@@ -157394,6 +157253,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v34, v35
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v49, v39
@@ -157859,7 +157719,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1e
+; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
@@ -158589,7 +158449,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l
; GFX11-TRUE16-NEXT: .LBB89_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1e
+; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
@@ -158631,7 +158491,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1e
+; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
@@ -159415,7 +159275,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
; GFX11-FAKE16-NEXT: .LBB89_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1e
+; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
@@ -161506,6 +161366,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-LABEL: bitcast_v64bf16_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -161522,9 +161385,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: ; implicit-def: $vgpr35
; VI-NEXT: ; implicit-def: $vgpr45
; VI-NEXT: ; implicit-def: $vgpr34
@@ -161713,166 +161573,165 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2]
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32]
-; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18
; VI-NEXT: v_mov_b32_e32 v45, v46
; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11
; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v46, v63
; VI-NEXT: v_mov_b32_e32 v63, v50
; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10
; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
; VI-NEXT: v_mov_b32_e32 v51, v57
; VI-NEXT: v_mov_b32_e32 v50, v56
; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5
; VI-NEXT: v_mov_b32_e32 v57, v43
; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[23:24]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4
; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[21:22]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2
; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[19:20]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1
; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18]
; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1
; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12
; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12
; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11
@@ -161885,6 +161744,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6
; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32
; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32
; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32
@@ -162518,27 +162378,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
+; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
@@ -163282,49 +163142,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11
-; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24
-; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23
-; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
@@ -163338,6 +163160,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(44)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62
@@ -163355,130 +163178,168 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v6
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2
+; GFX9-NEXT: s_waitcnt vmcnt(35)
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16]
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12]
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8]
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4]
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[62:63]
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28]
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10
; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
@@ -163571,16 +163432,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc
; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT: v_mov_b32_e32 v59, v32
; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21
-; GFX9-NEXT: v_mov_b32_e32 v58, v31
; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7
; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc
@@ -163735,7 +163591,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: s_waitcnt vmcnt(52)
+; GFX9-NEXT: s_waitcnt vmcnt(50)
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v62
; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; GFX9-NEXT: v_cndmask_b32_e32 v44, v18, v19, vcc
@@ -163750,7 +163606,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
@@ -163891,8 +163746,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v9
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT: v_mov_b32_e32 v59, v32
; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX9-NEXT: v_mov_b32_e32 v58, v31
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6
; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
@@ -163958,6 +163815,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: v_perm_b32 v61, v28, v0, s7
; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v41, vcc
; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6
@@ -163965,7 +163823,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc
; GFX9-NEXT: v_perm_b32 v41, v13, v0, s7
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16
; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1
@@ -163994,24 +163852,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v15
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX9-NEXT: v_cndmask_b32_e32 v15, v31, v45, vcc
+; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7
; GFX9-NEXT: v_perm_b32 v31, v15, v26, s7
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16
-; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26
; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14
-; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
@@ -164031,12 +163879,19 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26
; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27
; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: v_perm_b32 v36, v44, v29, s7
; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v29
; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23
+; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22
; GFX9-NEXT: v_perm_b32 v38, v21, v43, s7
@@ -164045,6 +163900,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v20
+; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; GFX9-NEXT: v_perm_b32 v51, v6, v17, s7
; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7
@@ -164052,12 +163925,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19
; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v57
-; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43
; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v47
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -164101,7 +163970,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v56
; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
@@ -164134,74 +164002,51 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[35:36]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59]
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v32
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34]
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v31
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v31
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v42
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61]
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v42
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v41
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v55
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v55
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63]
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v40
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v39
; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v53
@@ -164214,15 +164059,26 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44]
; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35
; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59]
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
@@ -164231,6 +164087,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v60
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v60
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
@@ -164255,31 +164114,33 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49
; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v63, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v35
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GFX9-NEXT: v_mov_b32_e32 v63, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54
-; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40
-; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48
; GFX9-NEXT: v_mov_b32_e32 v62, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v35
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v34
-; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v44
; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v44
-; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43
; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v43
+; GFX9-NEXT: s_waitcnt vmcnt(24)
+; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59]
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v59
; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v58
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(19)
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v60
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61
; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v61
; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v60
@@ -164294,6 +164155,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v54
; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32
; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -164302,6 +164167,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v39
; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51
@@ -164310,38 +164177,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52
; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11
; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v12
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13
; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v14, 8, v14
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15
; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v43
; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v60
; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v31
; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -164679,7 +164537,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x2
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:240
@@ -164712,7 +164574,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1b
+; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:112
@@ -164741,10 +164603,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:12
-; GFX11-TRUE16-NEXT: s_clause 0x2
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
@@ -165778,7 +165636,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:20
@@ -165811,7 +165669,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:136
-; GFX11-TRUE16-NEXT: s_clause 0x1b
+; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:140
; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:144
; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:148
@@ -165846,7 +165704,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x15
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:96
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:92
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:88
@@ -165869,10 +165731,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
@@ -166991,7 +166849,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x15
+; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20
@@ -169924,6 +169782,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_mov_b32_e32 v43, s4
; VI-NEXT: v_readlane_b32 s4, v62, 13
; VI-NEXT: v_mov_b32_e32 v46, s4
+; VI-NEXT: v_mov_b32_e32 v45, s72
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v45, s74
+; VI-NEXT: v_mov_b32_e32 v42, s54
+; VI-NEXT: v_mov_b32_e32 v41, s46
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s56
; VI-NEXT: v_readlane_b32 s4, v62, 14
; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 15
@@ -169949,6 +169816,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_readlane_b32 s4, v62, 22
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v45, s76
; VI-NEXT: v_readlane_b32 s4, v62, 23
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, s4
@@ -169994,6 +169866,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_readlane_b32 s4, v62, 37
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_readlane_b32 s4, v62, 38
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, s4
@@ -170052,45 +169926,47 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, s4
; VI-NEXT: v_readlane_b32 s4, v62, 57
-; VI-NEXT: v_mov_b32_e32 v42, s54
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_mov_b32_e32 v41, s46
+; VI-NEXT: v_mov_b32_e32 v36, s66
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v41, s56
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v41, s58
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v41, s60
-; VI-NEXT: v_mov_b32_e32 v45, s72
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v45, s74
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v45, s76
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v45, s78
; VI-NEXT: v_mov_b32_e32 v55, s88
+; VI-NEXT: v_mov_b32_e32 v35, s30
+; VI-NEXT: v_mov_b32_e32 v41, s58
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v36, s66
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v35, s85
+; VI-NEXT: v_mov_b32_e32 v34, s38
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v52, s64
-; VI-NEXT: v_mov_b32_e32 v55, v50
-; VI-NEXT: v_mov_b32_e32 v35, s30
; VI-NEXT: v_mov_b32_e32 v59, s87
+; VI-NEXT: v_mov_b32_e32 v41, s60
+; VI-NEXT: v_mov_b32_e32 v55, v50
; VI-NEXT: v_mov_b32_e32 v58, s34
; VI-NEXT: v_mov_b32_e32 v45, s36
-; VI-NEXT: v_mov_b32_e32 v34, s38
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v50, v46
+; VI-NEXT: v_mov_b32_e32 v46, v48
+; VI-NEXT: v_mov_b32_e32 v48, v47
+; VI-NEXT: v_mov_b32_e32 v47, v56
+; VI-NEXT: v_mov_b32_e32 v56, v51
+; VI-NEXT: v_mov_b32_e32 v51, s90
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v34, s48
; VI-NEXT: v_mov_b32_e32 v1, s44
; VI-NEXT: v_mov_b32_e32 v2, s45
; VI-NEXT: v_mov_b32_e32 v3, s42
@@ -170123,37 +169999,19 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_mov_b32_e32 v30, s29
; VI-NEXT: v_mov_b32_e32 v32, s5
; VI-NEXT: v_mov_b32_e32 v41, s62
+; VI-NEXT: v_mov_b32_e32 v51, v53
+; VI-NEXT: v_mov_b32_e32 v53, v54
+; VI-NEXT: v_mov_b32_e32 v54, v40
+; VI-NEXT: v_mov_b32_e32 v40, s80
; VI-NEXT: v_mov_b32_e32 v57, s81
; VI-NEXT: v_mov_b32_e32 v37, s84
+; VI-NEXT: v_mov_b32_e32 v58, s50
; VI-NEXT: v_mov_b32_e32 v60, s52
; VI-NEXT: v_mov_b32_e32 v38, s51
; VI-NEXT: v_mov_b32_e32 v61, s65
; VI-NEXT: v_mov_b32_e32 v49, s66
-; VI-NEXT: v_mov_b32_e32 v39, s55
-; VI-NEXT: v_mov_b32_e32 v50, v46
-; VI-NEXT: v_mov_b32_e32 v46, v48
-; VI-NEXT: v_mov_b32_e32 v48, v47
-; VI-NEXT: v_mov_b32_e32 v47, v56
-; VI-NEXT: v_mov_b32_e32 v56, v51
-; VI-NEXT: v_mov_b32_e32 v51, s90
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v35, s85
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v34, s48
-; VI-NEXT: v_mov_b32_e32 v51, v53
-; VI-NEXT: v_mov_b32_e32 v53, v54
-; VI-NEXT: v_mov_b32_e32 v54, v40
-; VI-NEXT: v_mov_b32_e32 v40, s80
-; VI-NEXT: v_mov_b32_e32 v58, s50
; VI-NEXT: v_mov_b32_e32 v45, s53
+; VI-NEXT: v_mov_b32_e32 v39, s55
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; VI-NEXT: .LBB91_5: ; %end
@@ -172194,7 +172052,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-TRUE16-NEXT: s_clause 0x3
+; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:8
@@ -173744,7 +173602,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-TRUE16-NEXT: s_clause 0x3
+; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:8
@@ -173757,7 +173615,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:8
@@ -175314,7 +175172,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32
; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:8
@@ -175488,9 +175346,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38
@@ -175508,6 +175363,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:192
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr37
@@ -175525,15 +175383,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208
@@ -175669,34 +175527,37 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
@@ -175716,7 +175577,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120
@@ -175726,7 +175590,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216
@@ -175752,14 +175618,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
@@ -175882,7 +175740,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v2, 0xff, v47
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v6, 0xff, v42
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
@@ -176540,25 +176397,18 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v20, 0xff, v20
; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16
; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34
; SI-NEXT: v_and_b32_e32 v34, 0xff, v34
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30
; SI-NEXT: v_and_b32_e32 v30, 0xff, v30
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_and_b32_e32 v28, 0xff, v28
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26
; SI-NEXT: v_and_b32_e32 v26, 0xff, v26
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22
; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18
; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4
@@ -177657,8 +177507,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5
; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7
; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9
@@ -177754,13 +177604,25 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -177888,14 +177750,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -177903,26 +177770,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -177931,35 +177778,57 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr40
; VI-NEXT: ; implicit-def: $vgpr41
; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr49
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -177992,39 +177861,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(6)
-; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr59
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -178190,17 +178039,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr53
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -178885,8 +178726,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5
; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9
@@ -178997,13 +178838,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -179136,14 +178991,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -179151,26 +179011,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -179179,36 +179019,62 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(10)
+; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; implicit-def: $vgpr55
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr38
; GFX9-NEXT: ; implicit-def: $vgpr35
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -179231,49 +179097,25 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(7)
-; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr60
-; GFX9-NEXT: ; implicit-def: $vgpr56
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr46
-; GFX9-NEXT: ; implicit-def: $vgpr44
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -179439,17 +179281,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr48
-; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6
@@ -180795,7 +180629,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
@@ -180828,7 +180662,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
@@ -181657,7 +181491,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100
; GFX11-FAKE16-NEXT: .LBB92_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400
@@ -181690,7 +181524,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528
@@ -183515,33 +183349,53 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
+; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
+; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
+; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -183586,52 +183440,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68
; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
-; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
-; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
-; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
-; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
-; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
-; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
-; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
-; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
@@ -183651,6 +183459,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
@@ -183659,7 +183468,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
@@ -183691,6 +183499,25 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB93_2
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
@@ -183715,15 +183542,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v2, v8
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
@@ -183773,10 +183603,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -183784,50 +183615,37 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(9)
-; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v42, v43
; VI-NEXT: v_mov_b32_e32 v43, v37
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
@@ -183842,13 +183660,12 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -183870,21 +183687,28 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v54, v33
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v56, v1
; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v63, v39
+; VI-NEXT: v_mov_b32_e32 v54, v33
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v57, v0
; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -183902,11 +183726,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v53, v35
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, s4, v0
; VI-NEXT: s_and_b32 s4, s16, 0xff
@@ -183939,7 +183762,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: s_branch .LBB93_3
; VI-NEXT: .LBB93_2:
; VI-NEXT: v_mov_b32_e32 v47, v54
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
@@ -183960,6 +183782,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v58, v7
; VI-NEXT: v_mov_b32_e32 v57, v5
; VI-NEXT: v_mov_b32_e32 v56, v3
@@ -184551,29 +184374,51 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224
; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232
; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
+; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156
+; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188
+; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196
+; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204
+; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212
+; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
+; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260
+; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268
+; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
+; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292
+; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
+; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324
; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: s_waitcnt vmcnt(29)
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
@@ -184637,82 +184482,42 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156
-; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188
-; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196
-; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204
-; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212
-; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
-; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260
-; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268
-; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
-; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292
-; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
-; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(22)
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(24)
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
@@ -184733,6 +184538,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(55)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB93_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_and_b32 s4, s28, 0xff
@@ -184986,14 +184798,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
@@ -185003,7 +184814,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: .LBB93_2:
; GFX9-NEXT: v_mov_b32_e32 v58, v50
; GFX9-NEXT: v_mov_b32_e32 v45, v59
-; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
@@ -185015,6 +184825,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v34, v35
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v49, v39
@@ -185480,7 +185291,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1e
+; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
@@ -186210,7 +186021,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l
; GFX11-TRUE16-NEXT: .LBB93_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1e
+; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
@@ -186252,7 +186063,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1e
+; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
@@ -187036,7 +186847,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
; GFX11-FAKE16-NEXT: .LBB93_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1e
+; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
@@ -189098,27 +188909,42 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v44, v12
; VI-NEXT: v_mov_b32_e32 v12, v0
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v32, v20
; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v43, v11
; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4
-; VI-NEXT: v_mov_b32_e32 v32, v20
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, v22
; VI-NEXT: v_mov_b32_e32 v54, v21
; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v43, v11
; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v44
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43
; VI-NEXT: ; implicit-def: $vgpr20
; VI-NEXT: ; implicit-def: $vgpr57
; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr8
+; VI-NEXT: ; implicit-def: $vgpr4
; VI-NEXT: ; implicit-def: $vgpr41
; VI-NEXT: ; implicit-def: $vgpr56
; VI-NEXT: ; implicit-def: $vgpr63
@@ -189130,47 +188956,38 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: ; implicit-def: $vgpr45
; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v32
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31
-; VI-NEXT: ; implicit-def: $vgpr8
-; VI-NEXT: ; implicit-def: $vgpr15
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7
; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31
+; VI-NEXT: ; implicit-def: $vgpr15
; VI-NEXT: ; implicit-def: $vgpr13
; VI-NEXT: ; implicit-def: $vgpr9
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr5
-; VI-NEXT: ; implicit-def: $vgpr4
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr30
+; VI-NEXT: ; implicit-def: $vgpr34
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v29
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28
@@ -189179,38 +188996,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr29
; VI-NEXT: ; implicit-def: $vgpr28
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; VI-NEXT: ; implicit-def: $vgpr27
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; VI-NEXT: ; implicit-def: $vgpr25
-; VI-NEXT: ; implicit-def: $vgpr24
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr23
-; VI-NEXT: ; implicit-def: $vgpr34
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v33
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr0
@@ -189254,8 +189039,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr0
; VI-NEXT: ; kill: killed $vgpr0
; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; VI-NEXT: ; kill: killed $vgpr0
; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr5
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; VI-NEXT: ; implicit-def: $vgpr27
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr25
+; VI-NEXT: ; implicit-def: $vgpr24
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr23
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr10
@@ -189293,28 +189104,49 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v56, v38
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v45, v7
-; VI-NEXT: v_mov_b32_e32 v63, v53
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v15, v3
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v28, v48
; VI-NEXT: v_mov_b32_e32 v48, v16
; VI-NEXT: v_mov_b32_e32 v16, v40
; VI-NEXT: v_mov_b32_e32 v47, v39
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v63, v53
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v32
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v44
; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v32
; VI-NEXT: v_lshrrev_b32_e32 v13, 24, v18
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v38
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v37
@@ -189326,83 +189158,20 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v62, v36
-; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v10
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v7
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v7
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53
-; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27
-; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v62, v36
; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[37:38]
@@ -189417,61 +189186,94 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7]
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3]
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27]
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34]
-; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36
-; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36]
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33
+; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50
+; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49
+; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40
+; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11
+; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53
+; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53
+; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52
+; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27
+; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34
; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[52:53]
; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[58:59]
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27
; VI-NEXT: v_mov_b32_e32 v53, v63
-; VI-NEXT: v_mov_b32_e32 v27, v19
-; VI-NEXT: v_mov_b32_e32 v34, v14
-; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55
+; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40
+; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6
; VI-NEXT: v_mov_b32_e32 v7, v45
; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31
-; VI-NEXT: v_mov_b32_e32 v3, v15
-; VI-NEXT: v_mov_b32_e32 v15, v29
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; VI-NEXT: v_mov_b32_e32 v38, v56
-; VI-NEXT: v_mov_b32_e32 v29, v41
; VI-NEXT: v_mov_b32_e32 v45, v60
-; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55
; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49
+; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(13)
+; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59
+; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3]
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26
+; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27]
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36
+; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36
+; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35
; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v50
-; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36]
; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[49:50]
; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[39:40]
; VI-NEXT: v_mov_b32_e32 v58, v51
+; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34]
; VI-NEXT: v_mov_b32_e32 v36, v62
; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[54:55]
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18]
-; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40
-; VI-NEXT: v_mov_b32_e32 v40, v16
-; VI-NEXT: v_mov_b32_e32 v16, v48
-; VI-NEXT: v_mov_b32_e32 v48, v28
-; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v34, v14
; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v40, v16
+; VI-NEXT: v_mov_b32_e32 v16, v48
+; VI-NEXT: v_mov_b32_e32 v48, v28
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55
+; VI-NEXT: v_mov_b32_e32 v3, v15
+; VI-NEXT: v_mov_b32_e32 v15, v29
+; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v38, v56
; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v39
+; VI-NEXT: v_mov_b32_e32 v29, v41
; VI-NEXT: v_mov_b32_e32 v39, v47
; VI-NEXT: v_mov_b32_e32 v47, v4
; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v54
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55
; VI-NEXT: .LBB94_2: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB94_4
; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v63, 0x200
; VI-NEXT: v_add_f16_sdwa v21, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21
@@ -189490,36 +189292,47 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_add_f16_e32 v31, 0x200, v31
; VI-NEXT: v_add_f16_sdwa v23, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_or_b32_e32 v14, v31, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23
; VI-NEXT: v_add_f16_e32 v55, 0x200, v55
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_e32 v62, v55, v0
; VI-NEXT: v_add_f16_sdwa v0, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_add_f16_e32 v54, 0x200, v54
; VI-NEXT: v_or_b32_e32 v61, v54, v0
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v26, v54
; VI-NEXT: v_mov_b32_e32 v27, v55
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60
; VI-NEXT: v_add_f16_e32 v25, 0x200, v25
; VI-NEXT: v_or_b32_e32 v34, v25, v0
; VI-NEXT: v_add_f16_sdwa v0, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v24, 0x200, v24
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v54, 0x200, v54
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v33, v24, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -189527,13 +189340,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v36, v2, v0
; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_e32 v35, v1, v0
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v25, 0x200, v25
+; VI-NEXT: v_add_f16_e32 v24, 0x200, v24
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
@@ -189542,38 +189363,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v38, v2, v0
; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_e32 v37, v1, v0
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v0, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v9, 0x200, v9
+; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v8, 0x200, v8
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_or_b32_e32 v49, v9, v0
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v3, 0x200, v3
+; VI-NEXT: v_or_b32_e32 v49, v9, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; VI-NEXT: v_add_f16_sdwa v1, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v48, v8, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v9, v31
; VI-NEXT: v_add_f16_sdwa v8, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v10, v32
@@ -189591,11 +189408,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v53, v2, v0
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_add_f16_sdwa v3, v44, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v44, 0x200, v44
; VI-NEXT: v_or_b32_e32 v52, v1, v0
@@ -189612,28 +189429,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v46, v2, v0
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; VI-NEXT: v_or_b32_e32 v45, v1, v0
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
; VI-NEXT: v_add_f16_e32 v6, 0x200, v6
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v5, v7, v0
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_e32 v4, v6, v0
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
+; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_f16_sdwa v56, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
; VI-NEXT: v_add_f16_e32 v6, 0x200, v6
@@ -189641,36 +189462,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v41, v7, v0
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39
-; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_e32 v40, v6, v0
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v25, 0x200, v25
-; VI-NEXT: v_add_f16_e32 v24, 0x200, v24
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42
; VI-NEXT: v_or_b32_e32 v7, v25, v0
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
-; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v54, 0x200, v54
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v46
; VI-NEXT: v_or_b32_e32 v6, v24, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3
@@ -189679,7 +189477,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v31, v43, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28
; VI-NEXT: v_or_b32_e32 v30, v2, v0
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_add_f16_sdwa v2, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v55, 0x200, v55
; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@@ -189695,8 +189492,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v1
; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v0
; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v30
@@ -189714,21 +189509,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7
+; VI-NEXT: v_mov_b32_e32 v32, v10
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7]
-; VI-NEXT: v_mov_b32_e32 v32, v10
; VI-NEXT: v_mov_b32_e32 v31, v9
; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7]
; VI-NEXT: v_mov_b32_e32 v7, v11
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[40:41]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v55, v27
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v54, v26
; VI-NEXT: v_mov_b32_e32 v26, v20
; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v5
@@ -189736,23 +189531,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_mov_b32_e32 v5, v22
; VI-NEXT: v_mov_b32_e32 v13, v21
; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[45:46]
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52
; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[50:51]
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v48
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[48:49]
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v36
@@ -189760,27 +189546,39 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36]
; VI-NEXT: v_mov_b32_e32 v36, v2
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v15
-; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61
-; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v14
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49
; VI-NEXT: v_mov_b32_e32 v48, v56
; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v33
; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[33:34]
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[14:15]
; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v58
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62
+; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61
+; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62]
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v57
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v9, v23
; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v40
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v14, v8
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v40, v42
; VI-NEXT: v_bfe_u32 v8, v42, 8, 8
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v38
; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v37
; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[37:38]
@@ -189797,26 +189595,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_bfe_u32 v51, v48, 8, 8
; VI-NEXT: v_bfe_u32 v57, v7, 8, 8
; VI-NEXT: v_bfe_u32 v58, v60, 8, 8
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_bfe_u32 v34, v62, 8, 8
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_bfe_u32 v2, v2, 8, 8
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT: v_bfe_u32 v34, v47, 8, 8
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_bfe_u32 v9, v9, 8, 8
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_bfe_u32 v5, v5, 8, 8
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; VI-NEXT: v_bfe_u32 v13, v13, 8, 8
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(12)
+; VI-NEXT: v_bfe_u32 v2, v2, 8, 8
+; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_bfe_u32 v42, v0, 8, 8
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: v_bfe_u32 v34, v62, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v34, v47, 8, 8
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_bfe_u32 v0, v0, 8, 8
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
@@ -189986,16 +189782,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v12
@@ -190067,14 +189862,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v12
; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v12
@@ -190082,41 +189876,42 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v63
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v12
; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v21
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v12
; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v30
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v12
; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v12
@@ -190124,6 +189919,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -190161,17 +189957,16 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v12
; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v12
@@ -190179,6 +189974,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -190207,22 +190003,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-LABEL: bitcast_v64f16_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -190285,6 +190065,23 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
@@ -190315,7 +190112,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr52
; GFX9-NEXT: ; implicit-def: $vgpr51
; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -190349,7 +190145,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(33)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -190472,101 +190268,100 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(46)
+; GFX9-NEXT: s_waitcnt vmcnt(62)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
@@ -190582,6 +190377,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
@@ -190607,7 +190403,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
@@ -191633,7 +191429,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -191654,10 +191454,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -192293,7 +192089,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -194483,8 +194279,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v7
; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v10
@@ -194492,6 +194286,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v9
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v13
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v12
; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13]
@@ -194499,12 +194294,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v1
; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2]
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4
@@ -194512,14 +194301,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4]
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10]
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16
; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16]
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19
; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v18
; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19]
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v15
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v35
; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35]
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
@@ -194554,6 +194349,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_bfe_u32 v11, v52, 8, 8
; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33
; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28
@@ -195713,42 +195509,42 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26
@@ -196715,7 +196511,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
@@ -196750,7 +196546,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: v_writelane_b32 v76, s101, 5
; GFX11-NEXT: s_mov_b32 s99, 0
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64
@@ -197669,7 +197465,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80
; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v74, off, s32
; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8
@@ -197731,7 +197527,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: v_readlane_b32 s31, v75, 1
; GFX11-NEXT: v_readlane_b32 s30, v75, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76
; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80
; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84
@@ -197782,11 +197578,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v54, v15
; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v15
; SI-NEXT: v_mov_b32_e32 v57, v5
; SI-NEXT: v_mov_b32_e32 v41, v3
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392
@@ -197876,7 +197672,30 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160
+; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -197884,28 +197703,21 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32
; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: ; implicit-def: $vgpr27
@@ -197913,240 +197725,211 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160
+; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11
; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76
@@ -198158,15 +197941,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
@@ -198202,7 +197989,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48
@@ -198682,15 +198469,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v25, v6, v13
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v6, v6, v5
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26
; SI-NEXT: v_or_b32_e32 v6, v6, v11
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
; SI-NEXT: v_or_b32_e32 v5, v5, v8
; SI-NEXT: s_waitcnt expcnt(0)
@@ -200009,8 +199796,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5
; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7
; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9
@@ -200106,13 +199893,25 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -200240,14 +200039,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -200255,26 +200059,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -200283,35 +200067,57 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr40
; VI-NEXT: ; implicit-def: $vgpr41
; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr49
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -200344,39 +200150,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(6)
-; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr59
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -200542,17 +200328,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr53
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -201237,8 +201015,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5
; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9
@@ -201349,13 +201127,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -201488,14 +201280,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -201503,26 +201300,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -201531,36 +201308,62 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(10)
+; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; implicit-def: $vgpr55
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr38
; GFX9-NEXT: ; implicit-def: $vgpr35
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -201583,49 +201386,25 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(7)
-; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr60
-; GFX9-NEXT: ; implicit-def: $vgpr56
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr46
-; GFX9-NEXT: ; implicit-def: $vgpr44
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -201791,17 +201570,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr48
-; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6
@@ -203147,7 +202918,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
@@ -203180,7 +202951,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
@@ -204009,7 +203780,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100
; GFX11-FAKE16-NEXT: .LBB96_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392
; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396
; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400
@@ -204042,7 +203813,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512
; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520
; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524
; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528
@@ -204087,7 +203858,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324
@@ -204097,9 +203867,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304
; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
-; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_writelane_b32 v41, s30, 0
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v43, s29, 0
; SI-NEXT: v_writelane_b32 v43, s28, 1
; SI-NEXT: v_writelane_b32 v43, s27, 2
@@ -204148,6 +203918,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_writelane_b32 v41, s96, 32
; SI-NEXT: v_writelane_b32 v41, s97, 33
; SI-NEXT: v_writelane_b32 v41, s98, 34
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
; SI-NEXT: v_readfirstlane_b32 s39, v26
; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
; SI-NEXT: v_readfirstlane_b32 s47, v12
@@ -204170,9 +203946,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s59, v28
; SI-NEXT: v_readfirstlane_b32 s60, v27
; SI-NEXT: v_readfirstlane_b32 s11, v1
-; SI-NEXT: v_readfirstlane_b32 s12, v2
-; SI-NEXT: v_readfirstlane_b32 s13, v9
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: v_writelane_b32 v43, s4, 14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300
@@ -204181,30 +203955,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v32
; SI-NEXT: v_writelane_b32 v43, s4, 15
-; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272
; SI-NEXT: v_writelane_b32 v43, s4, 16
-; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
; SI-NEXT: v_writelane_b32 v43, s4, 17
-; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v35
; SI-NEXT: v_writelane_b32 v43, s4, 18
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s44, v36
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s90, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s6, v38
+; SI-NEXT: v_readfirstlane_b32 s12, v2
+; SI-NEXT: v_readfirstlane_b32 s13, v9
; SI-NEXT: v_readfirstlane_b32 s14, v10
; SI-NEXT: v_readfirstlane_b32 s15, v8
; SI-NEXT: v_readfirstlane_b32 s18, v7
@@ -204218,6 +203990,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s77, v15
; SI-NEXT: v_readfirstlane_b32 s38, v25
; SI-NEXT: v_writelane_b32 v41, s99, 35
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_readfirstlane_b32 s93, v55
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_readfirstlane_b32 s95, v40
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: v_writelane_b32 v43, s4, 19
@@ -204294,39 +204070,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_writelane_b32 v43, s4, 30
; SI-NEXT: v_readfirstlane_b32 s4, v32
; SI-NEXT: v_writelane_b32 v43, s4, 31
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
; SI-NEXT: v_writelane_b32 v43, s4, 32
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s9, v35
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v37
; SI-NEXT: v_writelane_b32 v43, s4, 33
; SI-NEXT: v_readfirstlane_b32 s10, v36
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: v_writelane_b32 v43, s4, 34
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s4, v38
; SI-NEXT: v_writelane_b32 v43, s4, 35
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s4, v39
; SI-NEXT: v_writelane_b32 v43, s4, 36
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_readfirstlane_b32 s69, v48
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_readfirstlane_b32 s30, v49
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s16, v50
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s36, v51
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136
@@ -204340,7 +204112,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104
; SI-NEXT: v_writelane_b32 v43, s4, 37
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_readfirstlane_b32 s4, v52
; SI-NEXT: v_writelane_b32 v43, s4, 38
; SI-NEXT: v_readfirstlane_b32 s4, v53
@@ -204367,9 +204139,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_writelane_b32 v43, s43, 58
; SI-NEXT: v_writelane_b32 v43, s76, 59
; SI-NEXT: v_writelane_b32 v43, s77, 60
-; SI-NEXT: v_readfirstlane_b32 s93, v55
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_readfirstlane_b32 s95, v40
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s17, v33
; SI-NEXT: s_waitcnt vmcnt(9)
@@ -205938,33 +205707,53 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24
; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
+; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
+; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
+; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -206009,52 +205798,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68
; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
-; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
-; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
-; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
-; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
-; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
-; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
-; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
-; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
@@ -206074,6 +205817,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
@@ -206082,7 +205826,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
@@ -206114,6 +205857,25 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB97_2
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
@@ -206138,15 +205900,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v2, v8
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
@@ -206196,10 +205961,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -206207,50 +205973,37 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(9)
-; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v42, v43
; VI-NEXT: v_mov_b32_e32 v43, v37
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
@@ -206265,13 +206018,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -206293,21 +206045,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v54, v33
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v56, v1
; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v63, v39
+; VI-NEXT: v_mov_b32_e32 v54, v33
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v57, v0
; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -206325,11 +206084,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v53, v35
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, s4, v0
; VI-NEXT: s_and_b32 s4, s16, 0xff
@@ -206362,7 +206120,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_branch .LBB97_3
; VI-NEXT: .LBB97_2:
; VI-NEXT: v_mov_b32_e32 v47, v54
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
@@ -206383,6 +206140,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v58, v7
; VI-NEXT: v_mov_b32_e32 v57, v5
; VI-NEXT: v_mov_b32_e32 v56, v3
@@ -206974,29 +206732,51 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224
; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232
; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
+; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156
+; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188
+; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196
+; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204
+; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212
+; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
+; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260
+; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268
+; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
+; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292
+; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
+; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324
; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: s_waitcnt vmcnt(29)
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
@@ -207060,82 +206840,42 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156
-; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188
-; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196
-; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204
-; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212
-; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
-; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260
-; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268
-; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
-; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292
-; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
-; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(22)
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(24)
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(35)
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
@@ -207156,6 +206896,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(55)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB97_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
; GFX9-NEXT: s_and_b32 s4, s28, 0xff
@@ -207409,14 +207156,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
@@ -207426,7 +207172,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: .LBB97_2:
; GFX9-NEXT: v_mov_b32_e32 v58, v50
; GFX9-NEXT: v_mov_b32_e32 v45, v59
-; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
@@ -207438,6 +207183,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v34, v35
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; GFX9-NEXT: v_mov_b32_e32 v49, v39
@@ -207903,7 +207649,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1e
+; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
@@ -208633,7 +208379,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l
; GFX11-TRUE16-NEXT: .LBB97_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1e
+; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
@@ -208675,7 +208421,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1e
+; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
@@ -209459,7 +209205,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
; GFX11-FAKE16-NEXT: .LBB97_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1e
+; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
@@ -209562,100 +209308,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28
-; SI-NEXT: ; kill: killed $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6
@@ -209785,14 +209437,29 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
@@ -209809,13 +209476,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
@@ -209870,12 +209530,39 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; implicit-def: $vgpr18
@@ -209885,36 +209572,81 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112
; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
@@ -209936,6 +209668,18 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB98_2
@@ -211555,22 +211299,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-LABEL: bitcast_v64i16_to_v128i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32
@@ -211588,6 +211316,22 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9
; VI-NEXT: ; kill: killed $vgpr35
; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7
; VI-NEXT: ; kill: killed $vgpr35
@@ -211884,14 +211628,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v9, v8
; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v7, v5
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v7, v6
; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4
@@ -211923,10 +211665,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_mov_b32_e32 v3, v2
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37]
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v5, v4
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30
@@ -211997,10 +211735,16 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20]
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, v18
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v5, v4
; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22]
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, v46
; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18]
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26
; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24
; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22
@@ -212201,9 +211945,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v15
; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v13, v41, v13
@@ -212211,38 +211952,35 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13
; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11
; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10
; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9
; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7
; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5
; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3
; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
@@ -212255,8 +211993,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37]
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30
@@ -212325,6 +212061,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_mov_b32_e32 v49, v53
; VI-NEXT: v_mov_b32_e32 v53, v38
; VI-NEXT: v_mov_b32_e32 v38, v55
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18
; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
@@ -212336,6 +212073,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_mov_b32_e32 v55, v31
; VI-NEXT: v_bfe_u32 v61, v53, 8, 8
; VI-NEXT: v_bfe_u32 v31, v38, 8, 8
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; VI-NEXT: .LBB98_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -212790,22 +212534,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-LABEL: bitcast_v64i16_to_v128i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -212868,6 +212596,23 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
@@ -212898,7 +212643,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr52
; GFX9-NEXT: ; implicit-def: $vgpr51
; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -212932,7 +212676,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(33)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -213055,101 +212799,100 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(46)
+; GFX9-NEXT: s_waitcnt vmcnt(62)
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
@@ -213165,6 +212908,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
@@ -213189,7 +212933,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: s_waitcnt vmcnt(34)
; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
@@ -214215,7 +213959,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -214236,10 +213984,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -214875,7 +214619,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -215014,26 +214758,26 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s91, v32
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_readfirstlane_b32 s93, v33
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_readfirstlane_b32 s55, v34
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s17, v35
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s95, v36
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_readfirstlane_b32 s35, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s83, v38
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80
@@ -215046,39 +214790,34 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s39, v1
; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s77, v31
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s38, v32
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s48, v33
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s50, v39
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s76, v48
+; SI-NEXT: v_readfirstlane_b32 s77, v31
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s30, v49
+; SI-NEXT: v_readfirstlane_b32 s38, v32
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s34, v50
+; SI-NEXT: v_readfirstlane_b32 s48, v33
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s36, v51
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_readfirstlane_b32 s99, v34
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s50, v39
; SI-NEXT: v_readfirstlane_b32 s90, v35
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s92, v36
; SI-NEXT: v_writelane_b32 v41, s90, 11
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_readfirstlane_b32 s94, v37
; SI-NEXT: v_writelane_b32 v41, s92, 12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s30, v49
; SI-NEXT: v_writelane_b32 v41, s94, 13
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_readfirstlane_b32 s34, v50
; SI-NEXT: v_writelane_b32 v41, s30, 14
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_readfirstlane_b32 s36, v51
; SI-NEXT: v_writelane_b32 v41, s34, 15
; SI-NEXT: v_writelane_b32 v41, s36, 16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38
; SI-NEXT: v_writelane_b32 v41, s38, 17
+; SI-NEXT: v_readfirstlane_b32 s76, v48
+; SI-NEXT: v_readfirstlane_b32 s99, v34
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_writelane_b32 v41, s48, 18
; SI-NEXT: v_writelane_b32 v41, s50, 19
@@ -218060,48 +217799,48 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10]
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
; GFX9-NEXT: v_pk_add_u16 v12, s41, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, s40, 3 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
-; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
-; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5
+; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26
@@ -219068,7 +218807,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
@@ -219103,7 +218842,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: v_writelane_b32 v76, s101, 5
; GFX11-NEXT: s_mov_b32 s99, 0
; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64
@@ -220022,7 +219761,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80
; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v74, off, s32
; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8
@@ -220084,7 +219823,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: v_readlane_b32 s31, v75, 1
; GFX11-NEXT: v_readlane_b32 s30, v75, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76
; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80
; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84
@@ -221471,6 +221210,8 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; VI-LABEL: bitcast_v64bf16_to_v64f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -221487,9 +221228,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -221738,7 +221477,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -222104,6 +221842,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; GFX9-LABEL: bitcast_v64bf16_to_v64f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -222120,9 +221861,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -222341,7 +222080,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31
; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -222641,7 +222380,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -223201,7 +222940,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v42 :: v_dual_mov_b32 v11, v43
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v44 :: v_dual_mov_b32 v13, v45
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v46 :: v_dual_mov_b32 v15, v47
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -231398,17 +231137,32 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16
; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16
; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16
+; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
@@ -231418,57 +231172,63 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13
; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_alignbit_b32 v25, v45, v8, 16
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13
; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33
; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
@@ -231526,31 +231286,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16
; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: .LBB104_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -231855,6 +231592,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; VI-LABEL: bitcast_v64bf16_to_v64i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -231871,9 +231610,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -232122,7 +231859,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -232488,6 +232224,9 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; GFX9-LABEL: bitcast_v64bf16_to_v64i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -232504,9 +232243,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -232725,7 +232462,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31
; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -234330,15 +234067,21 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v57, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34
-; SI-NEXT: v_mov_b32_e32 v57, v13
; SI-NEXT: v_mov_b32_e32 v40, v3
; SI-NEXT: v_mov_b32_e32 v54, v50
; SI-NEXT: v_mov_b32_e32 v46, v19
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
; SI-NEXT: v_mov_b32_e32 v44, v15
; SI-NEXT: v_mov_b32_e32 v9, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59
@@ -234372,32 +234115,24 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_mov_b32_e32 v42, v43
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(7) expcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v50
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
+; SI-NEXT: v_mov_b32_e32 v5, v19
+; SI-NEXT: v_mov_b32_e32 v7, v15
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; SI-NEXT: v_mov_b32_e32 v5, v19
-; SI-NEXT: v_mov_b32_e32 v7, v15
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
@@ -234533,9 +234268,7 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v54, v50
-; SI-NEXT: v_mov_b32_e32 v56, v47
; SI-NEXT: v_mov_b32_e32 v9, v11
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v53, v5
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
@@ -234543,6 +234276,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v47
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v40, v3
; SI-NEXT: v_mov_b32_e32 v44, v15
; SI-NEXT: v_mov_b32_e32 v57, v13
@@ -234850,16 +234585,18 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; SI-NEXT: v_lshr_b64 v[51:52], v[25:26], 16
; SI-NEXT: v_lshr_b64 v[52:53], v[1:2], 16
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_alignbit_b32 v16, v45, v16, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
; SI-NEXT: v_alignbit_b32 v28, v58, v27, 16
@@ -234917,19 +234654,14 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16
; SI-NEXT: .LBB105_5: ; %end
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -234955,12 +234687,11 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
@@ -234985,12 +234716,11 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
@@ -240180,38 +239910,39 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v9
; SI-NEXT: v_cvt_f16_f32_e32 v43, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v55, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v55, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v2
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v22
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v40, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v14
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v6, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v21
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v14
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v12, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v19
; SI-NEXT: v_cvt_f16_f32_e32 v19, v23
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v23, v25
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v13
; SI-NEXT: v_cvt_f16_f32_e32 v44, v4
; SI-NEXT: v_cvt_f16_f32_e32 v52, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v15
; SI-NEXT: v_cvt_f16_f32_e32 v48, v16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v15
; SI-NEXT: v_cvt_f16_f32_e32 v4, v17
; SI-NEXT: v_cvt_f16_f32_e32 v13, v20
; SI-NEXT: v_cvt_f16_f32_e32 v20, v24
@@ -240222,7 +239953,6 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v31, v27
; SI-NEXT: v_cvt_f16_f32_e32 v25, v50
; SI-NEXT: v_cvt_f16_f32_e32 v27, v29
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v6, v42
; SI-NEXT: v_cvt_f16_f32_e32 v21, v47
; SI-NEXT: v_cvt_f16_f32_e32 v22, v38
@@ -241300,10 +241030,12 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v50
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
@@ -241315,7 +241047,24 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v3, v26
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v49
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v54
+; SI-NEXT: v_mov_b32_e32 v54, v15
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v12
+; SI-NEXT: v_mov_b32_e32 v12, v42
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
@@ -241325,8 +241074,13 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v26, v3, v5
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v3, v22
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v49
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
@@ -241335,39 +241089,22 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v22, v3, v5
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v3, v18
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v49
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v54
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v18, v3, v5
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v3, v16
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_mov_b32_e32 v54, v15
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v12
-; SI-NEXT: v_mov_b32_e32 v12, v42
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
@@ -241385,8 +241122,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v14, v3, v5
@@ -241430,11 +241165,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v49
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v50
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
@@ -241571,27 +241301,27 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_or_b32_e32 v12, v50, v1
; SI-NEXT: v_lshr_b64 v[49:50], v[35:36], 16
-; SI-NEXT: v_mov_b32_e32 v35, v44
-; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16
; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16
-; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16
; SI-NEXT: v_lshr_b64 v[20:21], v[42:43], 16
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[20:21], v[9:10], 16
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v35, v44
+; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16
+; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[20:21], v[40:41], 16
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16
; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16
; SI-NEXT: v_mov_b32_e32 v42, v61
; SI-NEXT: v_mov_b32_e32 v61, v37
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 9041f64..5b42f95 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -17964,14 +17964,6 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-LABEL: bitcast_v40i8_to_v20i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v34, v10
; VI-NEXT: v_mov_b32_e32 v33, v8
; VI-NEXT: v_mov_b32_e32 v35, v6
@@ -17988,6 +17980,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v31, v14
; VI-NEXT: v_mov_b32_e32 v37, v12
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
@@ -18005,17 +18005,15 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -18046,7 +18044,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -18101,14 +18099,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_mov_b32_e32 v1, 0x300
; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_add_u16_e32 v0, 3, v54
; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_add_u16_e32 v0, 3, v53
; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_add_u16_e32 v0, 3, v51
; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_e32 v0, 3, v30
@@ -23918,18 +23916,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-LABEL: bitcast_v40i8_to_v20f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v36, v4
; SI-NEXT: v_mov_b32_e32 v31, v2
; SI-NEXT: v_mov_b32_e32 v35, v0
@@ -23943,6 +23929,18 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1
; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3
; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5
@@ -23974,20 +23972,16 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v4
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr32
@@ -24027,7 +24021,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v6, 0xff, v30
; SI-NEXT: v_or_b32_e32 v6, v6, v47
; SI-NEXT: v_cvt_f32_f16_e32 v15, v6
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_and_b32_e32 v6, 0xff, v50
; SI-NEXT: v_or_b32_e32 v6, v6, v56
; SI-NEXT: v_cvt_f32_f16_e32 v32, v6
@@ -24105,18 +24099,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_or_b32_e32 v0, v59, v0
; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: s_movk_i32 s6, 0x300
; SI-NEXT: v_or_b32_e32 v0, v58, v0
; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_or_b32_e32 v0, v57, v0
; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_or_b32_e32 v0, v56, v0
@@ -24232,14 +24225,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-LABEL: bitcast_v40i8_to_v20f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v34, v10
; VI-NEXT: v_mov_b32_e32 v33, v8
; VI-NEXT: v_mov_b32_e32 v35, v6
@@ -24256,6 +24241,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v31, v14
; VI-NEXT: v_mov_b32_e32 v37, v12
; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
@@ -24273,17 +24266,15 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -24314,7 +24305,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -24369,14 +24360,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_mov_b32_e32 v1, 0x300
; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_add_u16_e32 v0, 3, v54
; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_add_u16_e32 v0, 3, v53
; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_add_u16_e32 v0, 3, v51
; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_e32 v0, 3, v30
@@ -28252,15 +28243,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-LABEL: bitcast_v40i8_to_v5f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v36, v10
; SI-NEXT: v_mov_b32_e32 v35, v8
; SI-NEXT: v_mov_b32_e32 v34, v6
@@ -28277,6 +28259,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: s_waitcnt expcnt(0)
@@ -28295,17 +28286,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -28368,7 +28356,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v7, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v8, 0xff, v50
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_or_b32_e32 v8, v8, v23
@@ -28508,7 +28496,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
@@ -28557,15 +28545,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-LABEL: bitcast_v40i8_to_v5f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v36, v10
; VI-NEXT: v_mov_b32_e32 v35, v8
; VI-NEXT: v_mov_b32_e32 v34, v6
@@ -28582,6 +28561,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v38, v14
; VI-NEXT: v_mov_b32_e32 v37, v12
; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1
@@ -28599,17 +28587,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25
; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -28640,7 +28625,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -28748,7 +28733,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v7, 0x300, v7
; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v7, v7, v8
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_add_u16_e32 v8, 3, v50
; VI-NEXT: v_add_u16_e32 v10, 3, v49
; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -28780,15 +28765,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-LABEL: bitcast_v40i8_to_v5f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v36, v10
; GFX9-NEXT: v_mov_b32_e32 v35, v8
; GFX9-NEXT: v_mov_b32_e32 v34, v6
@@ -28805,6 +28781,16 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v38, v14
; GFX9-NEXT: v_mov_b32_e32 v37, v12
; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1
@@ -28822,17 +28808,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29
-; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: s_waitcnt vmcnt(12)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -28863,7 +28849,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -28971,7 +28957,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7
; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_add_u16_e32 v8, 3, v50
; GFX9-NEXT: v_add_u16_e32 v9, 3, v49
; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -32301,15 +32287,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-LABEL: bitcast_v40i8_to_v5i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v36, v10
; SI-NEXT: v_mov_b32_e32 v35, v8
; SI-NEXT: v_mov_b32_e32 v34, v6
@@ -32326,6 +32303,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: s_waitcnt expcnt(0)
@@ -32344,17 +32330,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -32417,7 +32400,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v7, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v8, 0xff, v50
; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
; SI-NEXT: v_or_b32_e32 v8, v8, v23
@@ -32557,7 +32540,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v8, v25, v8
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
@@ -32606,15 +32589,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-LABEL: bitcast_v40i8_to_v5i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v36, v10
; VI-NEXT: v_mov_b32_e32 v35, v8
; VI-NEXT: v_mov_b32_e32 v34, v6
@@ -32631,6 +32605,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v38, v14
; VI-NEXT: v_mov_b32_e32 v37, v12
; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1
@@ -32648,17 +32631,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25
; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -32689,7 +32669,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -32797,7 +32777,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v7, 0x300, v7
; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v7, v7, v8
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_add_u16_e32 v8, 3, v50
; VI-NEXT: v_add_u16_e32 v10, 3, v49
; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -32829,15 +32809,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-LABEL: bitcast_v40i8_to_v5i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v36, v10
; GFX9-NEXT: v_mov_b32_e32 v35, v8
; GFX9-NEXT: v_mov_b32_e32 v34, v6
@@ -32854,6 +32825,16 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v38, v14
; GFX9-NEXT: v_mov_b32_e32 v37, v12
; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1
@@ -32871,17 +32852,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25
; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29
-; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: s_waitcnt vmcnt(12)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -32912,7 +32893,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -33020,7 +33001,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7
; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: v_add_u16_e32 v8, 3, v50
; GFX9-NEXT: v_add_u16_e32 v9, 3, v49
; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index ee23420..c8d1762 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -2406,13 +2406,13 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v32i16_to_v16i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v32, v2
; SI-NEXT: v_mov_b32_e32 v31, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: v_mov_b32_e32 v36, v10
@@ -2435,9 +2435,9 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -11440,11 +11440,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92
@@ -11453,6 +11448,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -11484,7 +11484,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v52
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v12, 0xff, v58
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_or_b32_e32 v12, v54, v12
@@ -11723,7 +11722,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58
; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -11972,11 +11970,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -12016,16 +12014,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -12035,6 +12026,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -12044,11 +12042,10 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -12211,7 +12208,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v15, 0x300
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_add_u16_e32 v9, 3, v40
; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -12221,7 +12218,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v11, 3, v23
; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_add_u16_e32 v12, 3, v38
; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -12428,11 +12424,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -12476,16 +12472,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -12495,6 +12484,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -12504,11 +12500,10 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -12671,7 +12666,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GFX9-NEXT: s_movk_i32 s6, 0x300
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v40
; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -12681,7 +12676,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v11, 3, v23
; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_add_u16_e32 v12, 3, v38
; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -17323,13 +17317,13 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v32i16_to_v16f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v32, v2
; SI-NEXT: v_mov_b32_e32 v31, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: v_mov_b32_e32 v36, v10
@@ -17352,9 +17346,9 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -26452,11 +26446,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92
@@ -26465,6 +26454,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -26496,7 +26490,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v52
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v12, 0xff, v58
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_or_b32_e32 v12, v54, v12
@@ -26735,7 +26728,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58
; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -26984,11 +26976,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -27028,16 +27020,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -27047,6 +27032,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -27056,11 +27048,10 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -27223,7 +27214,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v15, 0x300
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_add_u16_e32 v9, 3, v40
; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -27233,7 +27224,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v11, 3, v23
; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_add_u16_e32 v12, 3, v38
; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -27440,11 +27430,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -27488,16 +27478,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -27507,6 +27490,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -27516,11 +27506,10 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -27683,7 +27672,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GFX9-NEXT: s_movk_i32 s6, 0x300
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v40
; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -27693,7 +27682,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v11, 3, v23
; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_add_u16_e32 v12, 3, v38
; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -31688,13 +31676,13 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v32i16_to_v8i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v32, v2
; SI-NEXT: v_mov_b32_e32 v31, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: v_mov_b32_e32 v36, v10
@@ -31717,9 +31705,9 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -40740,11 +40728,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92
@@ -40753,6 +40736,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -40784,7 +40772,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v52
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v12, 0xff, v58
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_or_b32_e32 v12, v54, v12
@@ -41023,7 +41010,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58
; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -41272,11 +41258,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -41316,16 +41302,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -41335,6 +41314,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -41344,11 +41330,10 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -41511,7 +41496,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v15, 0x300
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_add_u16_e32 v9, 3, v40
; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -41521,7 +41506,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v11, 3, v23
; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_add_u16_e32 v12, 3, v38
; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -41728,11 +41712,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -41776,16 +41760,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -41795,6 +41772,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -41804,11 +41788,10 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -41971,7 +41954,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GFX9-NEXT: s_movk_i32 s6, 0x300
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v40
; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -41981,7 +41964,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v11, 3, v23
; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_add_u16_e32 v12, 3, v38
; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -45317,13 +45299,13 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v32i16_to_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v32, v2
; SI-NEXT: v_mov_b32_e32 v31, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v38, v14
; SI-NEXT: v_mov_b32_e32 v37, v12
; SI-NEXT: v_mov_b32_e32 v36, v10
@@ -45346,9 +45328,9 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -54188,11 +54170,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92
@@ -54201,6 +54178,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -54232,7 +54214,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v52
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v12, 0xff, v58
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_or_b32_e32 v12, v54, v12
@@ -54471,7 +54452,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v43, v11
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58
; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -54720,11 +54700,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -54764,16 +54744,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -54783,6 +54756,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -54792,11 +54772,10 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -54959,7 +54938,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v15, 0x300
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_add_u16_e32 v9, 3, v40
; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -54969,7 +54948,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; VI-NEXT: v_add_u16_e32 v11, 3, v23
; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_add_u16_e32 v12, 3, v38
; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -55176,11 +55154,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120
; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128
+; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19
; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21
; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23
-; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
-; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29
; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
@@ -55224,16 +55202,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40
-; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
-; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41
-; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45
; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100
@@ -55243,6 +55214,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -55252,11 +55230,10 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -55419,7 +55396,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GFX9-NEXT: s_movk_i32 s6, 0x300
-; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_add_u16_e32 v9, 3, v40
; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -55429,7 +55406,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX9-NEXT: v_add_u16_e32 v11, 3, v23
; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_add_u16_e32 v12, 3, v38
; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -60580,6 +60556,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; SI-LABEL: bitcast_v32bf16_to_v32i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -60596,8 +60574,6 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0
; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1
@@ -60661,9 +60637,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr28
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -64471,44 +64446,44 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v14
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16]
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v13
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v12
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v12
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14]
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v10
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v8
+; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8
+; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v7
+; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v6
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
-; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
-; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
-; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5
; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9
; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v4
; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3
@@ -67768,17 +67743,61 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v21
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v19
; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v27
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr19
; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr27
; SI-NEXT: ; implicit-def: $vgpr55
@@ -67793,25 +67812,24 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31
; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32
; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v33
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v34
-; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35
-; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v37
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v38
; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v25
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v39
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v48
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr36
@@ -67819,7 +67837,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr25
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
@@ -67833,57 +67850,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12
; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v29
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -67892,7 +67860,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v21, 0xff, v58
; SI-NEXT: v_or_b32_e32 v21, v21, v26
; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21
@@ -68173,7 +68140,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB98_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
@@ -68198,7 +68164,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v3, v59, v3
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v1
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8
@@ -68222,7 +68187,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v54, vcc, s7, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -68430,8 +68394,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: .LBB98_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
@@ -68448,6 +68410,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, v37
; SI-NEXT: v_mov_b32_e32 v2, v48
@@ -68458,7 +68422,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_mov_b32_e32 v12, v32
; SI-NEXT: v_mov_b32_e32 v14, v51
; SI-NEXT: v_mov_b32_e32 v16, v34
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mov_b32_e32 v18, v52
; SI-NEXT: v_mov_b32_e32 v20, v36
; SI-NEXT: v_mov_b32_e32 v22, v53
@@ -70196,13 +70159,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v46, v30
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44
@@ -70219,6 +70181,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s43, v1
; SI-NEXT: v_readfirstlane_b32 s42, v0
; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v3
@@ -70242,19 +70205,19 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36
; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v48
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v39
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v30
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v31
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v33
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v34
; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -70280,7 +70243,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v60, v44
; SI-NEXT: v_or_b32_e32 v44, v53, v9
; SI-NEXT: v_or_b32_e32 v33, v1, v44
@@ -70725,12 +70688,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: .LBB99_3: ; %end
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
@@ -70747,6 +70704,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_waitcnt expcnt(0)
@@ -70758,11 +70721,13 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v6, s5
; SI-NEXT: v_mov_b32_e32 v7, s11
; SI-NEXT: v_mov_b32_e32 v8, v37
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_mov_b32_e32 v10, v38
; SI-NEXT: v_mov_b32_e32 v12, v33
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_mov_b32_e32 v14, v34
; SI-NEXT: v_mov_b32_e32 v16, v48
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v18, v49
; SI-NEXT: v_mov_b32_e32 v20, v35
; SI-NEXT: v_mov_b32_e32 v22, v36
@@ -70770,7 +70735,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v26, v51
; SI-NEXT: v_mov_b32_e32 v28, v54
; SI-NEXT: v_mov_b32_e32 v30, v55
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB99_4:
; SI-NEXT: v_mov_b32_e32 v39, v32
@@ -72188,6 +72152,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v32f16_to_v32bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -72204,8 +72170,6 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v33, v1
; SI-NEXT: v_cvt_f16_f32_e32 v34, v2
@@ -72273,9 +72237,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr28
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v63, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr31
@@ -79163,13 +79126,12 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: ; implicit-def: $sgpr75
; VI-NEXT: s_branch .LBB105_2
; VI-NEXT: .LBB105_4:
-; VI-NEXT: v_mov_b32_e32 v1, s58
; VI-NEXT: v_mov_b32_e32 v53, s56
; VI-NEXT: v_mov_b32_e32 v52, s42
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v52, s44
+; VI-NEXT: v_mov_b32_e32 v1, s58
; VI-NEXT: v_mov_b32_e32 v19, s67
; VI-NEXT: v_mov_b32_e32 v12, s66
; VI-NEXT: v_mov_b32_e32 v20, s65
@@ -79215,6 +79177,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_mov_b32_e32 v45, s78
; VI-NEXT: v_mov_b32_e32 v42, s76
; VI-NEXT: v_mov_b32_e32 v55, s74
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v54, s57
; VI-NEXT: v_mov_b32_e32 v41, s59
; VI-NEXT: v_mov_b32_e32 v44, s60
@@ -80286,6 +80249,14 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
@@ -80360,19 +80331,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v31
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v32
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v36
; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v37
@@ -80390,7 +80352,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v19, 0xff, v55
; SI-NEXT: v_or_b32_e32 v16, v19, v16
; SI-NEXT: v_cvt_f32_f16_e32 v34, v16
@@ -80403,7 +80365,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v12, 0xff, v18
; SI-NEXT: v_or_b32_e32 v10, v12, v10
; SI-NEXT: v_cvt_f32_f16_e32 v21, v10
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v10, 0xff, v41
; SI-NEXT: v_or_b32_e32 v8, v10, v8
; SI-NEXT: v_cvt_f32_f16_e32 v38, v8
@@ -80428,6 +80389,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v0, 0xff, v56
; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: v_cvt_f32_f16_e32 v29, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v6
; SI-NEXT: v_or_b32_e32 v0, v0, v46
; SI-NEXT: v_cvt_f32_f16_e32 v54, v0
@@ -80634,13 +80596,12 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB106_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v56
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
; SI-NEXT: v_or_b32_e32 v7, v3, v7
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47
; SI-NEXT: v_or_b32_e32 v6, v46, v6
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
@@ -80648,12 +80609,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v9, v35, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v6
; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v7
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_or_b32_e32 v7, v39, v7
; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v41
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
@@ -80852,13 +80811,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v31, v1
; SI-NEXT: .LBB106_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
@@ -80875,14 +80827,21 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v8, v33
; SI-NEXT: v_mov_b32_e32 v10, v37
; SI-NEXT: v_mov_b32_e32 v12, v49
; SI-NEXT: v_mov_b32_e32 v14, v53
; SI-NEXT: v_mov_b32_e32 v16, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mov_b32_e32 v18, v34
; SI-NEXT: v_mov_b32_e32 v20, v36
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mov_b32_e32 v22, v38
; SI-NEXT: v_mov_b32_e32 v24, v48
; SI-NEXT: v_mov_b32_e32 v26, v50
@@ -84461,22 +84420,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-LABEL: bitcast_v32bf16_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32
@@ -84542,6 +84485,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2
; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4
; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3
@@ -84605,11 +84564,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr58
; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v29, 1.0, v50
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr50
@@ -90429,6 +90386,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124
; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v13
; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v21
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v27
@@ -90458,28 +90417,30 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12
; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v17
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v20
-; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v24
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v28
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v31
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v32
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v33
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v34
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v36
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20
; SI-NEXT: s_waitcnt expcnt(0)
@@ -90496,8 +90457,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
@@ -90513,16 +90472,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v19
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v23
; SI-NEXT: ; kill: killed $vgpr3
@@ -90803,7 +90754,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB110_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16
@@ -90829,7 +90779,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v5, v58, v5
; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v3
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26
@@ -90841,7 +90790,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v5, v46, v5
; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v3
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v59
@@ -90854,7 +90802,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v5, v12, v5
; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20
@@ -90868,7 +90815,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v28
; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24
@@ -91086,11 +91032,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6
; SI-NEXT: .LBB110_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v2, v43
; SI-NEXT: v_mov_b32_e32 v10, v41
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_mov_b32_e32 v28, v40
; SI-NEXT: v_mov_b32_e32 v30, v42
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
@@ -91109,6 +91052,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v4, v33
; SI-NEXT: v_mov_b32_e32 v6, v39
; SI-NEXT: v_mov_b32_e32 v8, v51
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
index 5d4df4b..7bd2c7a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
@@ -6164,6 +6164,14 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v36f16_to_v18i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -6180,36 +6188,28 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
@@ -6224,14 +6224,12 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -13435,6 +13433,14 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v36f16_to_v18f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -13451,36 +13457,28 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
@@ -13495,14 +13493,12 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -19656,6 +19652,14 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v36f16_to_v9i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -19672,36 +19676,28 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
@@ -19716,14 +19712,12 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -25282,6 +25276,14 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v36f16_to_v9f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -25298,36 +25300,28 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
; SI-NEXT: v_cvt_f16_f32_e32 v35, v1
; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
; SI-NEXT: v_cvt_f16_f32_e32 v62, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v7
; SI-NEXT: v_cvt_f16_f32_e32 v60, v6
; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v8
; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
; SI-NEXT: v_cvt_f16_f32_e32 v56, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v13
; SI-NEXT: v_cvt_f16_f32_e32 v46, v12
; SI-NEXT: v_cvt_f16_f32_e32 v45, v15
; SI-NEXT: v_cvt_f16_f32_e32 v44, v14
@@ -25342,14 +25336,12 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v51, v25
; SI-NEXT: v_cvt_f16_f32_e32 v50, v24
; SI-NEXT: v_cvt_f16_f32_e32 v49, v27
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v36
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v39
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -26798,22 +26790,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v36i16_to_v36f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
@@ -26838,6 +26814,22 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; kill: killed $vgpr48
; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr63
@@ -26865,7 +26857,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; kill: killed $vgpr48
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -26892,7 +26884,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v47, v9
; SI-NEXT: v_cvt_f32_f16_e32 v60, v10
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v39
; SI-NEXT: v_cvt_f32_f16_e32 v45, v11
; SI-NEXT: v_cvt_f32_f16_e32 v58, v12
@@ -26977,7 +26969,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v27
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39
; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 44cfd6c..8964ebd 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -3541,6 +3541,17 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v20i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@@ -3562,17 +3573,6 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_mov_b32_e32 v39, v16
@@ -3594,13 +3594,10 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
@@ -4914,7 +4911,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@@ -4947,7 +4944,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@@ -4980,7 +4977,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@@ -5073,7 +5070,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@@ -5106,7 +5103,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@@ -5139,7 +5136,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@@ -8520,7 +8517,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@@ -8553,7 +8550,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@@ -8586,7 +8583,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@@ -8679,7 +8676,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@@ -8712,7 +8709,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@@ -8745,7 +8742,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@@ -11740,6 +11737,17 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v20f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@@ -11761,17 +11769,6 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_mov_b32_e32 v39, v16
@@ -11793,13 +11790,10 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
@@ -13113,7 +13107,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@@ -13146,7 +13140,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@@ -13179,7 +13173,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@@ -13272,7 +13266,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@@ -13305,7 +13299,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@@ -13338,7 +13332,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@@ -16833,7 +16827,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@@ -16866,7 +16860,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@@ -16899,7 +16893,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@@ -16992,7 +16986,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@@ -17025,7 +17019,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@@ -17058,7 +17052,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@@ -19249,6 +19243,17 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v10i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@@ -19270,17 +19275,6 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_mov_b32_e32 v39, v16
@@ -19302,13 +19296,10 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
@@ -20622,7 +20613,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@@ -20655,7 +20646,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@@ -20688,7 +20679,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@@ -20781,7 +20772,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@@ -20814,7 +20805,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@@ -20847,7 +20838,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@@ -24238,7 +24229,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@@ -24271,7 +24262,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@@ -24304,7 +24295,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@@ -24397,7 +24388,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@@ -24430,7 +24421,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@@ -24463,7 +24454,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@@ -25988,6 +25979,17 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v10f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@@ -26009,17 +26011,6 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_mov_b32_e32 v39, v16
@@ -26041,13 +26032,10 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
@@ -27361,7 +27349,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@@ -27394,7 +27382,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@@ -27427,7 +27415,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@@ -27520,7 +27508,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@@ -27553,7 +27541,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@@ -27586,7 +27574,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@@ -31014,7 +31002,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
@@ -31047,7 +31035,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
@@ -31080,7 +31068,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
@@ -31173,7 +31161,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
@@ -31206,7 +31194,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
@@ -31239,7 +31227,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
@@ -31389,6 +31377,17 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v40i16_to_v40f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
@@ -31405,17 +31404,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; kill: killed $vgpr40
; SI-NEXT: ; implicit-def: $vgpr40
@@ -31472,7 +31460,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; kill: killed $vgpr40
; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -31523,7 +31511,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v30
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_cvt_f32_f16_e32 v40, v48
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr3
@@ -31623,7 +31610,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v27
; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -31643,7 +31629,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v38
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48
; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index 87d5157..ed407c1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -3792,6 +3792,17 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v44i16_to_v22i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v39, v16
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@@ -3814,17 +3825,6 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v39, v16
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
@@ -3842,9 +3842,8 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
@@ -5329,7 +5328,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@@ -5362,7 +5361,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@@ -5395,7 +5394,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@@ -5496,7 +5495,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@@ -5529,7 +5528,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@@ -5562,7 +5561,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@@ -9311,7 +9310,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@@ -9344,7 +9343,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@@ -9377,7 +9376,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@@ -9478,7 +9477,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@@ -9511,7 +9510,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@@ -9544,7 +9543,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@@ -12755,6 +12754,17 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v44i16_to_v22f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v39, v16
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@@ -12777,17 +12787,6 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v39, v16
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
@@ -12805,9 +12804,8 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
@@ -14292,7 +14290,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@@ -14325,7 +14323,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@@ -14358,7 +14356,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@@ -14459,7 +14457,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@@ -14492,7 +14490,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@@ -14525,7 +14523,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@@ -18407,7 +18405,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@@ -18440,7 +18438,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@@ -18473,7 +18471,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@@ -18574,7 +18572,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@@ -18607,7 +18605,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@@ -18640,7 +18638,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@@ -21004,6 +21002,17 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v44i16_to_v11i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v39, v16
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@@ -21026,17 +21035,6 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v39, v16
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
@@ -21054,9 +21052,8 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
@@ -22541,7 +22538,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@@ -22574,7 +22571,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@@ -22607,7 +22604,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@@ -22708,7 +22705,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@@ -22741,7 +22738,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@@ -22774,7 +22771,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@@ -26535,7 +26532,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@@ -26568,7 +26565,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@@ -26601,7 +26598,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@@ -26702,7 +26699,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@@ -26735,7 +26732,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@@ -26768,7 +26765,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@@ -28420,6 +28417,17 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v44i16_to_v11f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v39, v16
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@@ -28442,17 +28450,6 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v39, v16
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_mov_b32_e32 v38, v18
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
@@ -28470,9 +28467,8 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
@@ -29957,7 +29953,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@@ -29990,7 +29986,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@@ -30023,7 +30019,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@@ -30124,7 +30120,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@@ -30157,7 +30153,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@@ -30190,7 +30186,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
@@ -33996,7 +33992,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
@@ -34029,7 +34025,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
@@ -34062,7 +34058,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
@@ -34163,7 +34159,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
@@ -34196,7 +34192,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
@@ -34229,7 +34225,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index fb2e94f..9ec3f5c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -4045,6 +4045,22 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v48i16_to_v24i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -4069,22 +4085,6 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
@@ -4100,21 +4100,14 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
@@ -5806,7 +5799,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@@ -5839,7 +5832,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@@ -5872,7 +5865,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@@ -5979,7 +5972,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@@ -6012,7 +6005,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@@ -6045,7 +6038,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@@ -8179,6 +8172,8 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v48f16_to_v24i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -8195,8 +8190,6 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
@@ -8223,34 +8216,34 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
@@ -10214,7 +10207,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@@ -10247,7 +10240,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@@ -10280,7 +10273,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@@ -10387,7 +10380,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@@ -10420,7 +10413,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@@ -10453,7 +10446,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@@ -13882,6 +13875,22 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v48i16_to_v24f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -13906,22 +13915,6 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
@@ -13937,21 +13930,14 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
@@ -15643,7 +15629,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@@ -15676,7 +15662,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@@ -15709,7 +15695,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@@ -15816,7 +15802,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@@ -15849,7 +15835,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@@ -15882,7 +15868,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@@ -18157,6 +18143,8 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v48f16_to_v24f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -18173,8 +18161,6 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
@@ -18201,34 +18187,34 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
@@ -20192,7 +20178,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@@ -20225,7 +20211,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@@ -20258,7 +20244,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@@ -20365,7 +20351,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@@ -20398,7 +20384,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@@ -20431,7 +20417,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@@ -22982,6 +22968,22 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v48i16_to_v12i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -23006,22 +23008,6 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
@@ -23037,21 +23023,14 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
@@ -24743,7 +24722,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@@ -24776,7 +24755,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@@ -24809,7 +24788,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@@ -24916,7 +24895,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@@ -24949,7 +24928,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@@ -24982,7 +24961,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@@ -27128,6 +27107,8 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v48f16_to_v12i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -27144,8 +27125,6 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
@@ -27172,34 +27151,34 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
@@ -29163,7 +29142,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@@ -29196,7 +29175,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@@ -29229,7 +29208,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@@ -29336,7 +29315,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@@ -29369,7 +29348,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@@ -29402,7 +29381,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@@ -31199,6 +31178,22 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v48i16_to_v12f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v48, v14
+; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -31223,22 +31218,6 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v48, v14
-; SI-NEXT: v_mov_b32_e32 v49, v12
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
@@ -31254,21 +31233,14 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
@@ -32960,7 +32932,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@@ -32993,7 +32965,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@@ -33026,7 +32998,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@@ -33133,7 +33105,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@@ -33166,7 +33138,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@@ -33199,7 +33171,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@@ -35392,6 +35364,8 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
; SI-LABEL: bitcast_v48f16_to_v12f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
@@ -35408,8 +35382,6 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
@@ -35436,34 +35408,34 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v3
; SI-NEXT: v_cvt_f16_f32_e32 v52, v2
; SI-NEXT: v_cvt_f16_f32_e32 v51, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v4
; SI-NEXT: v_cvt_f16_f32_e32 v49, v7
; SI-NEXT: v_cvt_f16_f32_e32 v48, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v9
; SI-NEXT: v_cvt_f16_f32_e32 v38, v8
; SI-NEXT: v_cvt_f16_f32_e32 v37, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v10
; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: v_cvt_f16_f32_e32 v34, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v15
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
@@ -37427,7 +37399,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
@@ -37460,7 +37432,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
@@ -37493,7 +37465,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
@@ -37600,7 +37572,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
@@ -37633,7 +37605,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
@@ -37666,7 +37638,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
@@ -41255,6 +41227,11 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; SI-LABEL: bitcast_v48f16_to_v48i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
@@ -41271,11 +41248,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_cvt_f16_f32_e32 v61, v2
; SI-NEXT: v_cvt_f16_f32_e32 v55, v3
@@ -41320,16 +41292,12 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v50, s25
; SI-NEXT: v_cvt_f16_f32_e32 v16, s26
; SI-NEXT: v_cvt_f16_f32_e32 v29, s29
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v32
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f16_f32_e32 v43, v33
; SI-NEXT: v_cvt_f16_f32_e32 v32, v20
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v25, v35
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v37
; SI-NEXT: v_cvt_f16_f32_e32 v20, s22
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index 07cdbef..c7a1993 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -4341,6 +4341,19 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v52i16_to_v26i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
@@ -4366,19 +4379,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5
@@ -4394,17 +4394,12 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8
@@ -4429,9 +4424,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64
@@ -4443,10 +4439,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
@@ -5032,7 +5027,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -5099,6 +5093,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -5231,6 +5226,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB14_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -5245,9 +5243,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6
@@ -5266,6 +5261,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -5294,10 +5293,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -6287,7 +6282,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -6320,7 +6315,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -6353,7 +6348,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -6465,7 +6460,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -6498,7 +6493,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -6531,7 +6526,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -9760,7 +9755,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -9827,6 +9821,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -9959,6 +9954,9 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB18_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -9973,9 +9971,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
@@ -9995,6 +9990,10 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -10023,10 +10022,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -10295,14 +10290,28 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v39
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
@@ -10318,22 +10327,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB19_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
@@ -10342,8 +10335,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
@@ -10363,10 +10356,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36
@@ -10407,11 +10398,11 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v25, v38, v25
; SI-NEXT: s_cbranch_execnz .LBB19_3
; SI-NEXT: .LBB19_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
@@ -10425,7 +10416,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
@@ -10463,7 +10453,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -11113,7 +11102,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -11146,7 +11135,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -11179,7 +11168,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -11291,7 +11280,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -11324,7 +11313,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -11357,7 +11346,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -15076,6 +15065,19 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v52i16_to_v26f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
@@ -15101,19 +15103,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5
@@ -15129,17 +15118,12 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8
@@ -15164,9 +15148,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64
@@ -15178,10 +15163,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
@@ -15767,7 +15751,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -15834,6 +15817,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -15966,6 +15950,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB30_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -15980,9 +15967,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6
@@ -16001,6 +15985,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -16029,10 +16017,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -17022,7 +17006,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -17055,7 +17039,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -17088,7 +17072,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -17200,7 +17184,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -17233,7 +17217,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -17266,7 +17250,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -20653,7 +20637,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -20720,6 +20703,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -20852,6 +20836,9 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB34_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -20866,9 +20853,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
@@ -20888,6 +20872,10 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -20916,10 +20904,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -21188,14 +21172,28 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v39
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
@@ -21211,22 +21209,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB35_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
@@ -21235,8 +21217,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
@@ -21256,10 +21238,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36
@@ -21300,11 +21280,11 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_or_b32_e32 v25, v38, v25
; SI-NEXT: s_cbranch_execnz .LBB35_3
; SI-NEXT: .LBB35_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
@@ -21318,7 +21298,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
@@ -21356,7 +21335,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -22006,7 +21984,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -22039,7 +22017,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -22072,7 +22050,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -22184,7 +22162,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -22217,7 +22195,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -22250,7 +22228,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -25023,6 +25001,19 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v52i16_to_v13i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
@@ -25048,19 +25039,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5
@@ -25076,17 +25054,12 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8
@@ -25111,9 +25084,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64
@@ -25125,10 +25099,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
@@ -25714,7 +25687,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -25781,6 +25753,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -25913,6 +25886,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB42_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -25927,9 +25903,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6
@@ -25948,6 +25921,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -25976,10 +25953,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -26969,7 +26942,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -27002,7 +26975,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -27035,7 +27008,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -27147,7 +27120,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -27180,7 +27153,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -27213,7 +27186,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -30457,7 +30430,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -30524,6 +30496,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -30656,6 +30629,9 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB46_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -30670,9 +30646,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
@@ -30692,6 +30665,10 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -30720,10 +30697,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -30992,14 +30965,28 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v39
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
@@ -31015,22 +31002,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
@@ -31039,8 +31010,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
@@ -31060,10 +31031,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36
@@ -31104,11 +31073,11 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v25, v38, v25
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
@@ -31122,7 +31091,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
@@ -31160,7 +31128,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -31810,7 +31777,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -31843,7 +31810,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -31876,7 +31843,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -31988,7 +31955,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -32021,7 +31988,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -32054,7 +32021,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -34053,6 +34020,19 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v52i16_to_v13f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
@@ -34078,19 +34058,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v49, v12
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5
@@ -34106,17 +34073,12 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8
@@ -34141,9 +34103,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64
@@ -34155,10 +34118,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
@@ -34744,7 +34706,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -34811,6 +34772,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -34943,6 +34905,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB50_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -34957,9 +34922,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6
@@ -34978,6 +34940,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -35006,10 +34972,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -35999,7 +35961,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -36032,7 +35994,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -36065,7 +36027,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -36177,7 +36139,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -36210,7 +36172,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -36243,7 +36205,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -39539,7 +39501,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v57, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -39606,6 +39567,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -39738,6 +39700,9 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB54_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -39752,9 +39717,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
@@ -39774,6 +39736,10 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -39802,10 +39768,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -40074,14 +40036,28 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v39
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
@@ -40097,22 +40073,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB55_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
@@ -40121,8 +40081,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
@@ -40142,10 +40102,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36
@@ -40186,11 +40144,11 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_or_b32_e32 v25, v38, v25
; SI-NEXT: s_cbranch_execnz .LBB55_3
; SI-NEXT: .LBB55_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
@@ -40204,7 +40162,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
@@ -40242,7 +40199,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -40892,7 +40848,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -40925,7 +40881,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -40958,7 +40914,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -41070,7 +41026,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -41103,7 +41059,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -41136,7 +41092,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -45248,6 +45204,15 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-LABEL: bitcast_v52f16_to_v52i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
@@ -45264,15 +45229,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_cvt_f16_f32_e32 v58, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v3
@@ -45317,26 +45273,19 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v41, s21
; SI-NEXT: v_cvt_f16_f32_e32 v16, s26
; SI-NEXT: v_cvt_f16_f32_e32 v54, s29
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_cvt_f16_f32_e32 v53, v32
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v33
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v30, v35
; SI-NEXT: v_cvt_f16_f32_e32 v35, v20
; SI-NEXT: v_cvt_f16_f32_e32 v33, v24
; SI-NEXT: v_cvt_f16_f32_e32 v31, v28
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cvt_f16_f32_e32 v55, v36
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cvt_f16_f32_e32 v4, v38
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f16_f32_e32 v27, v39
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v49
; SI-NEXT: v_cvt_f16_f32_e32 v24, s18
; SI-NEXT: v_cvt_f16_f32_e32 v20, s22
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 8eb71e9..77df03d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -4665,6 +4665,11 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v56i16_to_v28i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -4694,11 +4699,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
-; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
@@ -4715,9 +4715,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88
@@ -5413,7 +5412,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -5486,6 +5484,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -5634,6 +5633,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB14_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -5648,9 +5650,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
@@ -5669,6 +5668,10 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -5697,10 +5700,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -6780,7 +6779,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -6813,7 +6812,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -6846,7 +6845,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -6960,7 +6959,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -6993,7 +6992,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -7026,7 +7025,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -10560,7 +10559,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -10633,6 +10631,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -10781,6 +10780,9 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB18_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -10795,9 +10797,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
@@ -10817,6 +10816,10 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -10845,10 +10848,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -11148,7 +11147,20 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -11156,7 +11168,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -11188,19 +11199,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB19_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
@@ -11217,11 +11215,11 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_mov_b32_e32 v48, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_mov_b32_e32 v61, v44
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
; SI-NEXT: v_mov_b32_e32 v39, v11
@@ -11299,6 +11297,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v27, v50, v27
; SI-NEXT: s_cbranch_execnz .LBB19_3
; SI-NEXT: .LBB19_2: ; %cmp.true
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
; SI-NEXT: v_cvt_f32_f16_e32 v2, v54
; SI-NEXT: v_cvt_f32_f16_e32 v1, v55
@@ -11317,7 +11316,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -11585,7 +11583,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
@@ -12044,7 +12041,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -12077,7 +12074,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -12110,7 +12107,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -12224,7 +12221,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -12257,7 +12254,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -12290,7 +12287,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -16290,6 +16287,11 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v56i16_to_v28f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -16319,11 +16321,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
-; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
@@ -16340,9 +16337,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88
@@ -17038,7 +17034,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -17111,6 +17106,7 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -17259,6 +17255,9 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB30_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -17273,9 +17272,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
@@ -17294,6 +17290,10 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -17322,10 +17322,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -18405,7 +18401,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -18438,7 +18434,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -18471,7 +18467,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -18585,7 +18581,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -18618,7 +18614,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -18651,7 +18647,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -22343,7 +22339,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -22416,6 +22411,7 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -22564,6 +22560,9 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB34_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -22578,9 +22577,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
@@ -22600,6 +22596,10 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -22628,10 +22628,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -22931,7 +22927,20 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -22939,7 +22948,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -22971,19 +22979,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB35_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
@@ -23000,11 +22995,11 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_mov_b32_e32 v48, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_mov_b32_e32 v61, v44
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
; SI-NEXT: v_mov_b32_e32 v39, v11
@@ -23082,6 +23077,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_or_b32_e32 v27, v50, v27
; SI-NEXT: s_cbranch_execnz .LBB35_3
; SI-NEXT: .LBB35_2: ; %cmp.true
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
; SI-NEXT: v_cvt_f32_f16_e32 v2, v54
; SI-NEXT: v_cvt_f32_f16_e32 v1, v55
@@ -23100,7 +23096,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -23368,7 +23363,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
@@ -23827,7 +23821,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -23860,7 +23854,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -23893,7 +23887,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -24007,7 +24001,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -24040,7 +24034,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -24073,7 +24067,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -27080,6 +27074,11 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v56i16_to_v14i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -27109,11 +27108,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
-; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
@@ -27130,9 +27124,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88
@@ -27828,7 +27821,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -27901,6 +27893,7 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -28049,6 +28042,9 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB42_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -28063,9 +28059,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
@@ -28084,6 +28077,10 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -28112,10 +28109,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -29195,7 +29188,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -29228,7 +29221,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -29261,7 +29254,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -29375,7 +29368,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -29408,7 +29401,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -29441,7 +29434,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -32989,7 +32982,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -33062,6 +33054,7 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -33210,6 +33203,9 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB46_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -33224,9 +33220,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
@@ -33246,6 +33239,10 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -33274,10 +33271,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -33577,7 +33570,20 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -33585,7 +33591,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -33617,19 +33622,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
@@ -33646,11 +33638,11 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_mov_b32_e32 v48, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_mov_b32_e32 v61, v44
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
; SI-NEXT: v_mov_b32_e32 v39, v11
@@ -33728,6 +33720,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v27, v50, v27
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
; SI-NEXT: v_cvt_f32_f16_e32 v2, v54
; SI-NEXT: v_cvt_f32_f16_e32 v1, v55
@@ -33746,7 +33739,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -34014,7 +34006,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
@@ -34473,7 +34464,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -34506,7 +34497,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -34539,7 +34530,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -34653,7 +34644,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -34686,7 +34677,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -34719,7 +34710,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -36898,6 +36889,11 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; SI-LABEL: bitcast_v56i16_to_v14f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -36927,11 +36923,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
-; SI-NEXT: v_mov_b32_e32 v53, v4
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
@@ -36948,9 +36939,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88
@@ -37646,7 +37636,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -37719,6 +37708,7 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -37867,6 +37857,9 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB50_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -37881,9 +37874,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
@@ -37902,6 +37892,10 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -37930,10 +37924,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -39013,7 +39003,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -39046,7 +39036,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -39079,7 +39069,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -39193,7 +39183,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -39226,7 +39216,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -39259,7 +39249,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -42860,7 +42850,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v59, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -42933,6 +42922,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -43081,6 +43071,9 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB54_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -43095,9 +43088,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
@@ -43117,6 +43107,10 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -43145,10 +43139,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -43448,7 +43438,20 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -43456,7 +43459,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -43488,19 +43490,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB55_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
@@ -43517,11 +43506,11 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_mov_b32_e32 v48, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_mov_b32_e32 v61, v44
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
; SI-NEXT: v_mov_b32_e32 v39, v11
@@ -43599,6 +43588,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_or_b32_e32 v27, v50, v27
; SI-NEXT: s_cbranch_execnz .LBB55_3
; SI-NEXT: .LBB55_2: ; %cmp.true
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
; SI-NEXT: v_cvt_f32_f16_e32 v2, v54
; SI-NEXT: v_cvt_f32_f16_e32 v1, v55
@@ -43617,7 +43607,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -43885,7 +43874,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
@@ -44344,7 +44332,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -44377,7 +44365,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -44410,7 +44398,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -44524,7 +44512,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -44557,7 +44545,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -44590,7 +44578,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index 93c11f1..c9e5771 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -5032,40 +5032,53 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108
+; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48
@@ -5096,27 +5109,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -5201,7 +5197,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; kill: killed $vgpr30
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61
; SI-NEXT: ; kill: killed $vgpr30
@@ -5346,7 +5341,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -5494,7 +5488,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v60i16_to_v30i32:
@@ -5776,7 +5770,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -5855,6 +5848,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -6019,6 +6013,9 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB14_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -6033,9 +6030,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
@@ -6054,6 +6048,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -6082,10 +6080,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -7241,7 +7235,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -7274,7 +7268,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -7307,7 +7301,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -7424,7 +7418,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -7457,7 +7451,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -7490,7 +7484,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -10345,6 +10339,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32
@@ -10373,23 +10370,12 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: v_cvt_f16_f32_e32 v59, v1
; SI-NEXT: v_cvt_f16_f32_e32 v57, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v7
@@ -10399,8 +10385,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v9
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v8
@@ -10422,9 +10406,18 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
@@ -10434,6 +10427,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
@@ -10471,7 +10465,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v61
; SI-NEXT: v_cvt_f16_f32_e32 v61, v49
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v49, v55
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -10486,6 +10479,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -11357,7 +11351,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -11436,6 +11429,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -11600,6 +11594,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB18_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -11614,9 +11611,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
@@ -11636,6 +11630,10 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -11664,10 +11662,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -11988,12 +11982,35 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: v_cvt_f16_f32_e32 v50, v54
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v31, v40
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
@@ -12003,7 +12020,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v44
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -12012,7 +12029,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v46
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -12021,7 +12038,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v57
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v58
; SI-NEXT: v_cvt_f16_f32_e32 v58, s16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -12032,38 +12049,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB19_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v10, v3
-; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
; SI-NEXT: v_mov_b32_e32 v33, v32
; SI-NEXT: v_or_b32_e32 v10, v32, v10
@@ -12088,12 +12079,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: v_or_b32_e32 v1, v12, v1
; SI-NEXT: v_or_b32_e32 v2, v11, v2
@@ -12202,12 +12193,10 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v40, v44
; SI-NEXT: s_cbranch_vccnz .LBB19_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: v_cvt_f32_f16_e32 v1, v58
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
@@ -12993,7 +12982,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -13026,7 +13015,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -13059,7 +13048,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -13176,7 +13165,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -13209,7 +13198,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -13242,7 +13231,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -17570,40 +17559,53 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108
+; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48
@@ -17634,27 +17636,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -17739,7 +17724,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; kill: killed $vgpr30
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61
; SI-NEXT: ; kill: killed $vgpr30
@@ -17884,7 +17868,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -18032,7 +18015,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v60i16_to_v30f32:
@@ -18314,7 +18297,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -18393,6 +18375,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -18557,6 +18540,9 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB30_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -18571,9 +18557,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
@@ -18592,6 +18575,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -18620,10 +18607,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -19779,7 +19762,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -19812,7 +19795,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -19845,7 +19828,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -19962,7 +19945,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -19995,7 +19978,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -20028,7 +20011,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -23044,6 +23027,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32
@@ -23072,23 +23058,12 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: v_cvt_f16_f32_e32 v59, v1
; SI-NEXT: v_cvt_f16_f32_e32 v57, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v7
@@ -23098,8 +23073,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v9
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v8
@@ -23121,9 +23094,18 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
@@ -23133,6 +23115,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
@@ -23170,7 +23153,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v61
; SI-NEXT: v_cvt_f16_f32_e32 v61, v49
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v49, v55
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23185,6 +23167,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -24056,7 +24039,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -24135,6 +24117,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -24299,6 +24282,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB34_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -24313,9 +24299,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
@@ -24335,6 +24318,10 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -24363,10 +24350,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -24687,12 +24670,35 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: v_cvt_f16_f32_e32 v50, v54
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v31, v40
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
@@ -24702,7 +24708,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v44
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -24711,7 +24717,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v46
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -24720,7 +24726,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v57
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v58
; SI-NEXT: v_cvt_f16_f32_e32 v58, s16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -24731,38 +24737,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB35_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v10, v3
-; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
; SI-NEXT: v_mov_b32_e32 v33, v32
; SI-NEXT: v_or_b32_e32 v10, v32, v10
@@ -24787,12 +24767,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: v_or_b32_e32 v1, v12, v1
; SI-NEXT: v_or_b32_e32 v2, v11, v2
@@ -24901,12 +24881,10 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: v_mov_b32_e32 v40, v44
; SI-NEXT: s_cbranch_vccnz .LBB35_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: v_cvt_f32_f16_e32 v1, v58
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
@@ -25692,7 +25670,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -25725,7 +25703,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -25758,7 +25736,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -25875,7 +25853,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -25908,7 +25886,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -25941,7 +25919,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -29240,40 +29218,53 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108
+; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48
@@ -29304,27 +29295,10 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -29409,7 +29383,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; kill: killed $vgpr30
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61
; SI-NEXT: ; kill: killed $vgpr30
@@ -29554,7 +29527,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -29702,7 +29674,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v60i16_to_v15i64:
@@ -29984,7 +29956,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -30063,6 +30034,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -30227,6 +30199,9 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB42_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -30241,9 +30216,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
@@ -30262,6 +30234,10 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -30290,10 +30266,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -31449,7 +31421,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -31482,7 +31454,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -31515,7 +31487,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -31632,7 +31604,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -31665,7 +31637,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -31698,7 +31670,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -34570,6 +34542,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32
@@ -34598,23 +34573,12 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: v_cvt_f16_f32_e32 v59, v1
; SI-NEXT: v_cvt_f16_f32_e32 v57, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v7
@@ -34624,8 +34588,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v9
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v8
@@ -34647,9 +34609,18 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
@@ -34659,6 +34630,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
@@ -34696,7 +34668,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v61
; SI-NEXT: v_cvt_f16_f32_e32 v61, v49
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v49, v55
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -34711,6 +34682,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -35582,7 +35554,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -35661,6 +35632,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -35825,6 +35797,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB46_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -35839,9 +35814,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
@@ -35861,6 +35833,10 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -35889,10 +35865,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -36213,12 +36185,35 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: v_cvt_f16_f32_e32 v50, v54
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v31, v40
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
@@ -36228,7 +36223,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v44
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -36237,7 +36232,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v46
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -36246,7 +36241,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v57
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v58
; SI-NEXT: v_cvt_f16_f32_e32 v58, s16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -36257,38 +36252,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB47_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v10, v3
-; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
; SI-NEXT: v_mov_b32_e32 v33, v32
; SI-NEXT: v_or_b32_e32 v10, v32, v10
@@ -36313,12 +36282,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: v_or_b32_e32 v1, v12, v1
; SI-NEXT: v_or_b32_e32 v2, v11, v2
@@ -36427,12 +36396,10 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v40, v44
; SI-NEXT: s_cbranch_vccnz .LBB47_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: v_cvt_f32_f16_e32 v1, v58
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
@@ -37218,7 +37185,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -37251,7 +37218,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -37284,7 +37251,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -37401,7 +37368,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -37434,7 +37401,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -37467,7 +37434,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -39888,40 +39855,53 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108
+; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48
@@ -39952,27 +39932,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -40057,7 +40020,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; kill: killed $vgpr30
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61
; SI-NEXT: ; kill: killed $vgpr30
@@ -40202,7 +40164,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -40350,7 +40311,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v60i16_to_v15f64:
@@ -40632,7 +40593,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -40711,6 +40671,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -40875,6 +40836,9 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB50_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -40889,9 +40853,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
@@ -40910,6 +40871,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -40938,10 +40903,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -42097,7 +42058,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -42130,7 +42091,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -42163,7 +42124,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -42280,7 +42241,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -42313,7 +42274,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -42346,7 +42307,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -45262,6 +45223,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32
@@ -45290,23 +45254,12 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
; SI-NEXT: v_cvt_f16_f32_e32 v59, v1
; SI-NEXT: v_cvt_f16_f32_e32 v57, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v7
@@ -45316,8 +45269,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v9
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v8
@@ -45339,9 +45290,18 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v14
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v17
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v16
@@ -45351,6 +45311,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
@@ -45388,7 +45349,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v61
; SI-NEXT: v_cvt_f16_f32_e32 v61, v49
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v49, v55
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -45403,6 +45363,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -46274,7 +46235,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v61, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
@@ -46353,6 +46313,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -46517,6 +46478,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB54_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
@@ -46531,9 +46495,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6
; GFX9-NEXT: s_movk_i32 s7, 0x200
; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6
@@ -46553,6 +46514,10 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -46581,10 +46546,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -46905,12 +46866,35 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: v_cvt_f16_f32_e32 v50, v54
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v31, v40
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
@@ -46920,7 +46904,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v44
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -46929,7 +46913,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v46
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -46938,7 +46922,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v57
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v58
; SI-NEXT: v_cvt_f16_f32_e32 v58, s16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -46949,38 +46933,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB55_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v10, v3
-; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
; SI-NEXT: v_mov_b32_e32 v33, v32
; SI-NEXT: v_or_b32_e32 v10, v32, v10
@@ -47005,12 +46963,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: v_or_b32_e32 v1, v12, v1
; SI-NEXT: v_or_b32_e32 v2, v11, v2
@@ -47119,12 +47077,10 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: v_mov_b32_e32 v40, v44
; SI-NEXT: s_cbranch_vccnz .LBB55_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: v_cvt_f32_f16_e32 v1, v58
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
@@ -47910,7 +47866,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
@@ -47943,7 +47899,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
@@ -47976,7 +47932,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
@@ -48093,7 +48049,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
@@ -48126,7 +48082,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
@@ -48159,7 +48115,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
@@ -51893,27 +51849,27 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v55, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v4
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v18
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v40, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v53, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v10
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v49, v12
; SI-NEXT: v_cvt_f16_f32_e32 v6, v13
; SI-NEXT: v_cvt_f16_f32_e32 v37, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v18
; SI-NEXT: v_cvt_f16_f32_e32 v52, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v9
; SI-NEXT: v_cvt_f16_f32_e32 v48, v11
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_cvt_f16_f32_e32 v38, v16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v19
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
@@ -53259,6 +53215,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v44
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -53285,10 +53243,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v50
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v51, v11
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
@@ -53300,8 +53261,26 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v3, v26
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v50
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v8
+; SI-NEXT: v_mov_b32_e32 v8, v48
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v44
+; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v43
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
@@ -53329,17 +53308,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v18, v3, v5
; SI-NEXT: v_cvt_f32_f16_e32 v5, v37
; SI-NEXT: v_cvt_f32_f16_e32 v3, v16
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mov_b32_e32 v51, v11
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v55, v5
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
@@ -53382,52 +53355,32 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v3, v6
; SI-NEXT: v_lshr_b64 v[58:59], v[34:35], 16
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v50
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v8
-; SI-NEXT: v_mov_b32_e32 v8, v48
; SI-NEXT: v_cvt_f16_f32_e32 v48, v5
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48
-; SI-NEXT: v_or_b32_e32 v6, v3, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v4
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v44
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v31
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v4
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48
; SI-NEXT: v_mov_b32_e32 v59, v48
-; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v38
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60
-; SI-NEXT: v_or_b32_e32 v4, v3, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v43
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16
+; SI-NEXT: v_or_b32_e32 v6, v3, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v24
; SI-NEXT: v_cvt_f32_f16_e32 v24, v8
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v31
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60
+; SI-NEXT: v_or_b32_e32 v4, v3, v4
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v20
; SI-NEXT: v_cvt_f32_f16_e32 v20, v39
+; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20
; SI-NEXT: v_cvt_f16_f32_e32 v31, v20
@@ -53524,14 +53477,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v32, v41
; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16
; SI-NEXT: v_lshr_b64 v[20:21], v[11:12], 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[20:21], v[56:57], 16
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v11, v24
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16
; SI-NEXT: v_mov_b32_e32 v39, v31
; SI-NEXT: v_mov_b32_e32 v31, v60
@@ -53541,7 +53495,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v37, v55
; SI-NEXT: v_lshr_b64 v[55:56], v[5:6], 16
; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16
; SI-NEXT: .LBB59_3: ; %end
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 30ad46d9..f3885d6 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -968,14 +968,14 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc
; GFX8-NEXT: s_movk_i32 s4, 0x70
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29]
-; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
-; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -9552,6 +9552,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1
; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
@@ -9563,7 +9564,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
; GFX8-NEXT: flat_load_ushort v44, v[1:2]
; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index f8655a7..f465e3c 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -280,7 +280,7 @@ bb0:
br i1 %tmp, label %bb2, label %bb3
bb2:
- store volatile i32 17, ptr addrspace(1) undef
+ store volatile i32 17, ptr addrspace(1) poison
br label %bb4
bb3:
@@ -375,7 +375,7 @@ bb0:
br i1 %cmp0, label %bb2, label %bb1
bb1:
- %val = load volatile i32, ptr addrspace(4) undef
+ %val = load volatile i32, ptr addrspace(4) poison
%cmp1 = icmp eq i32 %val, 3
br i1 %cmp1, label %bb3, label %bb2
@@ -512,7 +512,7 @@ loop_body:
br label %loop
ret:
- store volatile i32 7, ptr addrspace(1) undef
+ store volatile i32 7, ptr addrspace(1) poison
ret void
}
@@ -622,7 +622,7 @@ bb14: ; preds = %bb13, %bb9
br label %bb19
bb19: ; preds = %bb14, %bb13, %bb9
- %tmp20 = phi i32 [ undef, %bb9 ], [ undef, %bb13 ], [ %tmp18, %bb14 ]
+ %tmp20 = phi i32 [ poison, %bb9 ], [ poison, %bb13 ], [ %tmp18, %bb14 ]
%tmp21 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %arg5
store i32 %tmp20, ptr addrspace(1) %tmp21, align 4
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index 6831380..04f8ad8 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -450,23 +450,38 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
@@ -976,23 +991,38 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
@@ -1159,24 +1189,23 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; SDAG-GFX1100-NEXT: s_mov_b32 s9, s12
; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX1100-NEXT: s_mov_b32 s6, s3
-; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0
; SDAG-GFX1100-NEXT: s_mov_b32 s8, s1
; SDAG-GFX1100-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2
-; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0
; SDAG-GFX1100-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; SDAG-GFX1100-NEXT: s_clause 0x1
; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x54
; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12
; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
+; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12
; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2
; SDAG-GFX1100-NEXT: s_mov_b32 s2, s1
-; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0)
@@ -1220,12 +1249,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2
; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3
-; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX1100-NEXT: s_clause 0x1
; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0
+; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1
; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2
; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 8e12e7e..832e43f 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -4253,6 +4253,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@@ -4260,7 +4261,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; VI-NEXT: s_mov_b32 s38, -1
; VI-NEXT: s_mov_b32 s39, 0xe80000
; VI-NEXT: s_add_u32 s36, s36, s3
@@ -4272,7 +4272,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32
; VI-NEXT: s_swappc_b64 s[30:31], s[8:9]
; VI-NEXT: s_endpgm
@@ -4285,6 +4285,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@@ -4292,7 +4293,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; CI-NEXT: s_mov_b32 s38, -1
; CI-NEXT: s_mov_b32 s39, 0xe8f000
; CI-NEXT: s_add_u32 s36, s36, s3
@@ -4304,7 +4304,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_waitcnt vmcnt(7)
+; CI-NEXT: s_waitcnt vmcnt(6)
; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32
; CI-NEXT: s_swappc_b64 s[30:31], s[8:9]
; CI-NEXT: s_endpgm
@@ -4317,6 +4317,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@@ -4324,7 +4325,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
; GFX9-NEXT: s_mov_b32 s38, -1
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s3
@@ -4336,7 +4336,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 0cae0e5..5cc6845 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -851,12 +851,12 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: ds_write_b8 v0, v1 offset:9
+; CI-NEXT: ds_write_b8 v0, v2 offset:13
; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1
; CI-NEXT: ds_write_b8 v0, v1 offset:5
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; CI-NEXT: ds_write_b8 v0, v1 offset:9
-; CI-NEXT: ds_write_b8 v0, v2 offset:13
; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 5fb50d0..da08f4f 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -3755,42 +3755,44 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; CI-NEXT: v_cvt_f16_f32_e32 v13, v22
; CI-NEXT: v_or_b32_e32 v10, v14, v10
+; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4
+; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32
; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; CI-NEXT: v_or_b32_e32 v17, v18, v17
; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT: v_or_b32_e32 v17, v18, v17
; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
; CI-NEXT: v_cvt_f16_f32_e32 v22, v27
-; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; CI-NEXT: v_or_b32_e32 v13, v16, v13
; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12
; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; CI-NEXT: v_or_b32_e32 v19, v20, v19
; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21
; CI-NEXT: v_cvt_f16_f32_e32 v21, v30
; CI-NEXT: v_or_b32_e32 v20, v22, v20
; CI-NEXT: v_cvt_f16_f32_e32 v22, v29
-; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: s_waitcnt vmcnt(8)
; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: s_waitcnt vmcnt(7)
; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; CI-NEXT: v_or_b32_e32 v21, v22, v21
; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: s_waitcnt vmcnt(5)
; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; CI-NEXT: s_waitcnt vmcnt(2)
+; CI-NEXT: s_waitcnt vmcnt(4)
; CI-NEXT: v_cvt_f16_f32_e32 v32, v32
; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
; CI-NEXT: v_cvt_f32_f16_e32 v32, v32
@@ -3802,6 +3804,27 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; CI-NEXT: v_or_b32_e32 v14, v15, v14
+; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; CI-NEXT: v_or_b32_e32 v12, v12, v15
+; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0
+; CI-NEXT: v_or_b32_e32 v11, v16, v11
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -3968,28 +3991,6 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: v_or_b32_e32 v31, v32, v31
; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0
; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; CI-NEXT: v_or_b32_e32 v14, v15, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v16
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v18
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
-; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; CI-NEXT: v_or_b32_e32 v12, v12, v15
-; CI-NEXT: v_or_b32_e32 v11, v16, v11
-; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0
; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen
; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0
; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen
diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
index 279f429..590d69b 100644
--- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
@@ -1,6 +1,19 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -passes=finalizebundle-test %s -o - | FileCheck %s
+--- |
+
+ @foo = addrspace(3) global i32 poison
+
+ define void @test_overlap() { unreachable }
+ define void @test_dead_redef() { unreachable }
+ define void @test_tied() { unreachable }
+ define void @test_mmo_merge1() { unreachable }
+ define void @test_mmo_merge2() { unreachable }
+ define void @test_mmo_drop() { unreachable }
+
+...
+
---
name: test_overlap
body: |
@@ -47,3 +60,42 @@ body: |
%1:vgpr_32 = COPY %0:vgpr_32
%2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec
...
+
+---
+name: test_mmo_merge1
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_mmo_merge1
+ ; CHECK: BUNDLE implicit-def %0, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3) {
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 %1:vgpr_32, internal [[COPY]], 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
+ ; CHECK-NEXT: }
+ %1:vgpr_32 = COPY %0:vgpr_32
+ DS_WRITE_B32_gfx9 %0, %1, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
+...
+
+---
+name: test_mmo_merge2
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_mmo_merge2
+ ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3), (store (s32) into @foo + 4, addrspace 3) {
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3)
+ ; CHECK-NEXT: }
+ DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
+ DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3)
+...
+
+---
+name: test_mmo_drop
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_mmo_drop
+ ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec {
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec
+ ; CHECK-NEXT: }
+ DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3)
+ DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index eefc781..3572340 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -263,7 +263,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %v
; Uniformity edge cases
; --------------------------------------------------------------------------------
-@ptr.in.lds = internal addrspace(3) global ptr undef
+@ptr.in.lds = internal addrspace(3) global ptr poison
; Base pointer is uniform, but also in VGPRs
define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i32 %data) {
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
index 32888d2..3d0e287 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
@@ -54,7 +54,7 @@ define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_neg2048(ptr inreg %s
; Uniformity edge cases
; --------------------------------------------------------------------------------
-@ptr.in.lds = internal addrspace(3) global ptr undef
+@ptr.in.lds = internal addrspace(3) global ptr poison
; Base pointer is uniform, but also in VGPRs
define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %data) {
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index b750d28..ba81446 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -807,7 +807,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX10-NEXT: buffer_store_dword v95, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_writelane_b32 v100, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: s_clause 0x1f
+; GFX10-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v95, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:8
@@ -863,7 +863,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX11-NEXT: s_mov_b32 s1, return_100xi32@abs32@hi
; GFX11-NEXT: s_mov_b32 s0, return_100xi32@abs32@lo
; GFX11-NEXT: s_addk_i32 s32, 0x90
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:124
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:120
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:116
@@ -898,7 +898,7 @@ define amdgpu_gfx void @call_100xi32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v95, s33
; GFX11-NEXT: v_writelane_b32 v100, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v95, off, s33
; GFX11-NEXT: scratch_load_b32 v94, off, s33 offset:4
; GFX11-NEXT: scratch_load_b32 v93, off, s33 offset:8
@@ -2518,7 +2518,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-LABEL: return_72xi32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0xc
+; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:204
@@ -2551,23 +2551,23 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
+; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112
; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108
; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104
-; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124
; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-NEXT: s_clause 0x10
; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144
; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140
; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136
-; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-NEXT: s_clause 0xd
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152
@@ -2608,7 +2608,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT: s_clause 0xc
+; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172
@@ -2641,21 +2641,6 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_add_i32 s32, s32, 0x28000
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
@@ -2733,6 +2718,21 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX9-NEXT: v_mov_b32_e32 v29, 0
; GFX9-NEXT: v_mov_b32_e32 v30, 0
; GFX9-NEXT: v_mov_b32_e32 v31, 0
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v63, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37]
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636
@@ -2914,21 +2914,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: s_mov_b32 s38, s34
; GFX10-NEXT: s_mov_b32 s34, s32
; GFX10-NEXT: s_add_i32 s32, s32, 0x14000
-; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: v_writelane_b32 v63, s30, 0
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
@@ -2971,12 +2957,11 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160
; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
-; GFX10-NEXT: v_writelane_b32 v63, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: v_mov_b32_e32 v6, 0
; GFX10-NEXT: v_mov_b32_e32 v7, 0
@@ -3006,6 +2991,21 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: v_mov_b32_e32 v31, 0
; GFX10-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi
; GFX10-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo
+; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: v_writelane_b32 v63, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
; GFX10-NEXT: s_clause 0x28
@@ -3138,7 +3138,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152
; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156
; GFX10-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160
-; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540
; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544
@@ -3151,7 +3151,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX10-NEXT: v_mov_b32_e32 v1, 42
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0
; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
-; GFX10-NEXT: s_clause 0xe
+; GFX10-NEXT: s_clause 0xe ; 60-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8
@@ -3199,7 +3199,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: s_mov_b32 s36, s34
; GFX11-NEXT: s_mov_b32 s34, s32
; GFX11-NEXT: s_addk_i32 s32, 0xa00
-; GFX11-NEXT: s_clause 0xb
+; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Spill
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36
@@ -3341,18 +3341,18 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: s_add_i32 s2, s32, 16
; GFX11-NEXT: v_mov_b32_e32 v30, v46
; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584
+; GFX11-NEXT: s_clause 0x3 ; 64-byte Folded Reload
; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568
; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552
; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536
+; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584
; GFX11-NEXT: s_add_i32 s2, s33, 0x400
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, 42
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_clause 0xb
+; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Reload
; GFX11-NEXT: scratch_load_b32 v59, off, s33
; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4
; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index f807169..93d7eeb 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -255,11 +255,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16
; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off
; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70
; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60
; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70
; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60
; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0
; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50
; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
index 7e1055b..03b56ca 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
@@ -11,7 +11,7 @@ body: |
; CHECK-LABEL: name: mimg_nsa
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) {
; CHECK-NEXT: S_CLAUSE 1
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
@@ -29,7 +29,7 @@ body: |
; CHECK-LABEL: name: mimg_nsa_mixed
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) {
; CHECK-NEXT: S_CLAUSE 2
; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
index 9689dda..68f9e83 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
@@ -10,7 +10,7 @@ body: |
; CHECK-LABEL: name: mimg
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) {
; CHECK-NEXT: S_CLAUSE 1
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
@@ -28,7 +28,7 @@ body: |
; CHECK-LABEL: name: mimg_mixed
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) {
; CHECK-NEXT: S_CLAUSE 2
; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
index 4719ab9..cbf697f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
@@ -1,13 +1,20 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -stop-after=postrapseudos -o - < %s | FileCheck -enable-var-scope -check-prefix=MIR %s

-; MIR-LABEL: name: gws_barrier_offset0{{$}}
-; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
-; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
-; MIR-NEXT: S_WAITCNT 0
-; MIR-NEXT: }
define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
+ ; MIR-LABEL: name: gws_barrier_offset0
+ ; MIR: bb.0 (%ir-block.0):
+ ; MIR-NEXT: liveins: $sgpr8_sgpr9
+ ; MIR-NEXT: {{ $}}
+ ; MIR-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset, align 16, addrspace 4)
+ ; MIR-NEXT: $m0 = S_MOV_B32 0
+ ; MIR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec
+ ; MIR-NEXT: BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
+ ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
+ ; MIR-NEXT: S_WAITCNT 0
+ ; MIR-NEXT: }
+ ; MIR-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
ret void
}

@@ -17,5 +24,3 @@ declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { convergent inaccessiblememonly nounwind }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; MIR: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
index c5f6e2b..417b8e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
@@ -35,7 +35,7 @@
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]

; MIR-LABEL: name: gws_barrier_offset0{{$}}
-; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
+; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec
; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
; MIR-NEXT: S_WAITCNT 0
; MIR-NEXT: }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 4419b8c..af270e5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -13,9 +13,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: v_dot2_bf16_bf16 v0.l, s2, s3, v0.l
; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -26,9 +26,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-FAKE16-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1
; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 0194d25..72b4769 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -12,9 +12,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -25,9 +25,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
@@ -38,9 +38,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GISEL-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GISEL-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -51,9 +51,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GISEL-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GISEL-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GISEL-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 0c1448a..1d08097 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -17,21 +17,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; SDAG-NEXT: v_mov_b32_e32 v4, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
; SDAG-NEXT: v_mov_b32_e32 v5, s16
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@@ -43,13 +41,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
@@ -175,16 +172,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@@ -207,16 +203,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
@@ -520,21 +515,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
; GCN-NEXT: v_mov_b32_e32 v5, s16
+; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@@ -634,16 +627,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@@ -802,11 +794,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -815,7 +807,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -833,12 +824,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -965,15 +955,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -1003,15 +992,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -1317,11 +1305,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1330,7 +1318,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1348,12 +1335,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1481,11 +1467,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1494,7 +1480,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1512,12 +1497,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1645,11 +1629,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1658,7 +1642,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1676,12 +1659,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1809,11 +1791,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1822,7 +1804,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1840,12 +1821,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1972,15 +1952,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2010,15 +1989,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -2323,15 +2301,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2361,15 +2338,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -2674,15 +2650,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2712,15 +2687,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -3025,15 +2999,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -3063,15 +3036,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index f93e5f0..83c240c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -10386,7 +10386,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s2, s0, 0x150
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15]
-; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11]
+; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19]
; GFX8-NEXT: v_mov_b32_e32 v13, s3
; GFX8-NEXT: v_mov_b32_e32 v12, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x140
@@ -10395,10 +10396,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v14, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x130
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
-; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11]
-; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19]
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v17, s3
; GFX8-NEXT: v_mov_b32_e32 v16, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x120
@@ -10406,20 +10403,21 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v19, s3
; GFX8-NEXT: v_mov_b32_e32 v18, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x110
-; GFX8-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo
; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi
; GFX8-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NEXT: v_mov_b32_e32 v7, s5
+; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: v_mov_b32_e32 v8, s12
-; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23]
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NEXT: v_mov_b32_e32 v9, s13
-; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27]
; GFX8-NEXT: v_mov_b32_e32 v10, s14
; GFX8-NEXT: v_mov_b32_e32 v11, s15
; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31]
@@ -10588,6 +10586,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23]
+; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27]
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index bca39d0..59f4a9d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -4582,18 +4582,18 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index e55fb2ca..7203545 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -3313,12 +3313,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6
; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6
@@ -3726,7 +3726,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: s_nop 0
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_nop 0
; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
@@ -3740,7 +3739,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(12)
+; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(11)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1
@@ -3749,6 +3748,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3
+; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23
@@ -3758,7 +3758,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index f879dc6..cb17f01 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -7788,19 +7788,18 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v13, 16, 8
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v14, 8, 8
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v17, 8, 8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v17
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 24, v17
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v52, v17, 16, 8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v53
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:128
@@ -7810,7 +7809,7 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index bd191a3..062a985 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -3172,27 +3172,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32:
; VI-NO-DS128: ; %bb.0:
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1
; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3
+; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
-; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@@ -3200,7 +3198,6 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20
; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18
; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17
-; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19
@@ -3243,17 +3240,19 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19
; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20
; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
-; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18
; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
+; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21
; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24
@@ -3296,21 +3295,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3
-; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
-; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: s_nop 0
-; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@@ -3337,9 +3332,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT: s_nop 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17
@@ -3360,16 +3357,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19
@@ -3806,9 +3804,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
; VI-DS128-NEXT: v_mov_b32_e32 v4, v3
; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
@@ -3825,23 +3825,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
-; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
-; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
-; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
-; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
+; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
@@ -3850,21 +3843,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
+; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -3875,16 +3872,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
-; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
+; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
@@ -3943,9 +3941,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3
; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
@@ -3964,24 +3964,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
-; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
-; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: s_nop 0
-; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
-; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
+; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
@@ -3990,21 +3982,26 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
+; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: s_nop 0
+; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -4015,16 +4012,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
-; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0
+; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
@@ -4197,29 +4195,20 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32:
; VI-NO-DS128: ; %bb.0:
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
+; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
+; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
+; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
-; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
-; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
-; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
-; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
@@ -4229,7 +4218,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
@@ -4247,16 +4236,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
+; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
+; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
+; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
-; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
+; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
@@ -4316,23 +4313,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
-; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: s_nop 0
-; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
@@ -4342,7 +4330,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
@@ -4360,16 +4348,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
+; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT: s_nop 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
@@ -4857,10 +4853,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_mov_b32_e32 v32, s1
; VI-DS128-NEXT: ds_read_b128 v[8:11], v32
; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
+; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
@@ -4873,12 +4871,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
-; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
@@ -4899,8 +4891,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
@@ -4913,14 +4908,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; VI-DS128-NEXT: v_mov_b32_e32 v32, s0
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
@@ -4985,9 +4981,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
+; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
@@ -5001,13 +4999,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
-; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: s_nop 0
-; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
-; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
@@ -5028,8 +5019,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
+; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: s_nop 0
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
@@ -5042,14 +5037,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
index 1d1d3e4..9da7a79 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
@@ -15,24 +15,23 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr
; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s6, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_mov_b32 s8, s1
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_mov_b32 s2, s1
-; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -63,10 +62,10 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s4, s3
@@ -100,25 +99,24 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s6, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_mov_b32 s8, s1
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_mov_b32 s2, s1
-; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
@@ -141,24 +139,23 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7)
; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s6, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_mov_b32 s8, s1
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_mov_b32 s2, s1
-; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index fc36ed9..84db54c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -128,10 +128,10 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
-; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
+; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -181,24 +181,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX11-SDAG-NEXT: s_mov_b32 s9, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_mov_b32 s6, s3
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX11-SDAG-NEXT: s_mov_b32 s8, s1
; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-SDAG-NEXT: s_mov_b32 s5, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
; GFX11-SDAG-NEXT: s_mov_b32 s2, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -215,12 +214,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
@@ -239,24 +238,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX12-SDAG-NEXT: s_mov_b32 s9, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_mov_b32 s6, s3
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: s_mov_b32 s8, s1
; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-SDAG-NEXT: s_mov_b32 s5, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
+; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
; GFX12-SDAG-NEXT: s_mov_b32 s2, s1
-; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
@@ -273,12 +271,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
@@ -413,11 +411,11 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
-; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
+; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -468,25 +466,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-SDAG-NEXT: s_mov_b32 s9, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_mov_b32 s6, s3
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX11-SDAG-NEXT: s_mov_b32 s8, s1
; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-SDAG-NEXT: s_mov_b32 s5, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
; GFX11-SDAG-NEXT: s_mov_b32 s2, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
@@ -503,13 +500,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX11-GISEL-NEXT: s_mov_b32 s8, s1
; GFX11-GISEL-NEXT: s_mov_b32 s9, s2
; GFX11-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_mov_b32 s4, s1
; GFX11-GISEL-NEXT: s_mov_b32 s5, s2
; GFX11-GISEL-NEXT: s_mov_b32 s6, s3
@@ -528,25 +525,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-SDAG-NEXT: s_mov_b32 s9, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_mov_b32 s6, s3
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: s_mov_b32 s8, s1
; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-SDAG-NEXT: s_mov_b32 s5, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
; GFX12-SDAG-NEXT: s_mov_b32 s2, s1
-; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
@@ -563,13 +559,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX12-GISEL-NEXT: s_mov_b32 s8, s1
; GFX12-GISEL-NEXT: s_mov_b32 s9, s2
; GFX12-GISEL-NEXT: s_mov_b32 s10, s3
-; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_mov_b32 s4, s1
; GFX12-GISEL-NEXT: s_mov_b32 s5, s2
; GFX12-GISEL-NEXT: s_mov_b32 s6, s3
diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll
index ae08054..ba53294 100644
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -774,9 +774,9 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
; GFX1250-NEXT: s_load_b32 s6, s[0:1], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_max_u32_e32 v0, s6, v0
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index 02f39e2..888a458 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -714,7 +714,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1
; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1
; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split
-; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8
@@ -1468,7 +1468,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1)
; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1
; ALIGNED-NEXT: s_cbranch_vccnz .LBB1_1
; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split
-; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8
@@ -1854,6 +1854,10 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142
; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140
@@ -1862,10 +1866,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130
; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128
-; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26
; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139
; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137
; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141
@@ -1901,14 +1901,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52
; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
-; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
-; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
-; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
-; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6
@@ -1923,6 +1915,14 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25
; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19
; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
+; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
+; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
+; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
+; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11
; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103
; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7
@@ -3438,7 +3438,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
; ALIGNED-NEXT: s_cbranch_vccnz .LBB3_1
; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split
-; ALIGNED-NEXT: s_clause 0x2f
+; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8
@@ -3741,23 +3741,23 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill
; ALIGNED-NEXT: .LBB4_1: ; %load-store-loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: s_clause 0x39
+; ALIGNED-NEXT: s_clause 0x3e
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:19
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30
; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35
; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
@@ -3779,17 +3779,17 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53
; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54
; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60
; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62
; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64
; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:67
; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:68
; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:69
@@ -3797,57 +3797,96 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71
; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76
; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78
; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: s_clause 0x33
+; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(56)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(55)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(54)
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(53)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(52)
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(51)
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(48)
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(47)
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(46)
; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(45)
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
-; ALIGNED-NEXT: s_waitcnt vmcnt(40)
; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6
; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
-; ALIGNED-NEXT: s_waitcnt vmcnt(39)
; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
-; ALIGNED-NEXT: s_waitcnt vmcnt(37)
; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
-; ALIGNED-NEXT: s_waitcnt vmcnt(35)
; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
-; ALIGNED-NEXT: s_waitcnt vmcnt(33)
; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25
-; ALIGNED-NEXT: s_waitcnt vmcnt(31)
; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(29)
; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5
@@ -3856,76 +3895,83 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11
; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(27)
; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(25)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(23)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(21)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(16)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(14)
; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52
; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(8)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v54
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v65
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
@@ -3934,52 +3980,82 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247
; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
-; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(42)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(41)
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(39)
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: s_waitcnt vmcnt(38)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(36)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: s_waitcnt vmcnt(35)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(8)
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
@@ -4251,259 +4327,132 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: s_clause 0x3
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10
+; ALIGNED-NEXT: s_waitcnt vmcnt(5)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v127, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x3e
-; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223
-; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224
-; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225
-; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226
-; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228
-; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229
-; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230
-; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235
-; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234
-; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241
-; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242
-; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243
-; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244
-; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245
-; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246
-; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: s_clause 0x6
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2
-; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101
; ALIGNED-NEXT: v_lshl_or_b32 v106, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v103, 8, v114
; ALIGNED-NEXT: v_lshl_or_b32 v4, v100, 8, v112
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125
; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3
-; ALIGNED-NEXT: s_waitcnt vmcnt(60)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v98
-; ALIGNED-NEXT: s_waitcnt vmcnt(58)
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94
; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v96
-; ALIGNED-NEXT: s_waitcnt vmcnt(14)
-; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
-; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110
; ALIGNED-NEXT: v_lshl_or_b32 v89, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86
; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v83
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104
; ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v81
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108
; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v71
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88
; ALIGNED-NEXT: v_lshl_or_b32 v46, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80
; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v53
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90
; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v70
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63
; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76
; ALIGNED-NEXT: v_lshl_or_b32 v115, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50
; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v48
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57
; ALIGNED-NEXT: v_lshl_or_b32 v99, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62
; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v33
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v82, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37
; ALIGNED-NEXT: v_lshl_or_b32 v4, v31, 8, v34
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61
; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30
; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44
; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24
; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v21
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45
; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20
; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118
; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41
; ALIGNED-NEXT: v_lshl_or_b32 v22, v73, 16, v4
; ALIGNED-NEXT: v_lshl_or_b32 v73, v11, 8, v12
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 16, v73
; ALIGNED-NEXT: v_lshl_or_b32 v73, v6, 8, v8
; ALIGNED-NEXT: v_lshl_or_b32 v77, v7, 8, v5
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen
+; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 16, v73
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1
; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4513,46 +4462,44 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73
; ALIGNED-NEXT: v_lshl_or_b32 v73, v109, 8, v107
; ALIGNED-NEXT: v_lshl_or_b32 v77, v1, 8, v120
+; ALIGNED-NEXT: s_clause 0x2
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10
; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73
; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:13
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v1
+; ALIGNED-NEXT: v_mov_b32_e32 v1, v107
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0
-; ALIGNED-NEXT: v_mov_b32_e32 v1, v107
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9
-; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_lshl_or_b32 v73, v120, 8, v122
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshl_or_b32 v77, v121, 8, v109
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18
; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16
; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236
; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228
; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224
-; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v77
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4625,6 +4572,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220
; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:212
; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:202
; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:203
; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:201
@@ -4641,7 +4589,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:198
; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:196
; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:192
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6
@@ -4656,6 +4603,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:186
; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:187
; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:185
@@ -4672,7 +4620,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:182
; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:180
; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:176
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload
@@ -4684,6 +4631,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:170
; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:171
; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:169
@@ -4700,7 +4648,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:166
; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:164
; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:160
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
@@ -4712,11 +4659,11 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154
; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155
; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153
; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
@@ -5234,9 +5181,9 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10
; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13
; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:9
@@ -5274,7 +5221,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0
; ALIGNED-NEXT: s_cbranch_vccnz .LBB4_1
; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split
-; ALIGNED-NEXT: s_clause 0x2f
+; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8
@@ -6797,7 +6744,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5
; ALIGNED-NEXT: .LBB5_6: ; %Flow6
; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8
@@ -8296,7 +8243,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5
; ALIGNED-NEXT: .LBB6_6: ; %Flow8
; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8
@@ -8848,14 +8795,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52
; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
-; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
-; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
-; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
-; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6
@@ -8871,6 +8810,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23
; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19
; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
+; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
+; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
+; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
+; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11
; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103
; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7
@@ -9297,6 +9244,10 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142
; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140
@@ -9305,10 +9256,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130
; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128
-; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26
; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139
; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137
; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141
@@ -9344,14 +9291,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308
; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
-; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
-; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
-; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
-; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6
@@ -9366,6 +9305,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25
; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19
; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
+; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
+; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
+; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
+; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11
; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103
; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7
@@ -12198,7 +12145,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_cbranch_scc0 .LBB8_5
; ALIGNED-NEXT: .LBB8_6: ; %Flow19
; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; ALIGNED-NEXT: s_clause 0x2f
+; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8
@@ -12645,6 +12592,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-LABEL: memmove_p0_p5_sz2048:
; ALIGNED: ; %bb.0: ; %entry
; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT: s_mov_b32 s6, exec_lo
; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
@@ -12693,34 +12645,29 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
-; ALIGNED-NEXT: s_mov_b32 s6, exec_lo
; ALIGNED-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v0
; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s6
; ALIGNED-NEXT: s_cbranch_execz .LBB9_2
; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: s_clause 0x39
+; ALIGNED-NEXT: s_clause 0x3e
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
-; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30
; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31
; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35
; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
@@ -12742,17 +12689,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53
; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54
; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60
; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56
; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57
; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62
; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63
; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64
; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67
; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68
; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69
@@ -12760,58 +12707,94 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71
; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76
; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78
; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: s_clause 0x30
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(56)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(55)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(54)
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(53)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(52)
; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(51)
; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(49)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(48)
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(47)
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(46)
; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(45)
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
-; ALIGNED-NEXT: s_waitcnt vmcnt(40)
; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6
; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
-; ALIGNED-NEXT: s_waitcnt vmcnt(39)
; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
-; ALIGNED-NEXT: s_waitcnt vmcnt(37)
; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
-; ALIGNED-NEXT: s_waitcnt vmcnt(35)
; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
-; ALIGNED-NEXT: s_waitcnt vmcnt(33)
; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25
-; ALIGNED-NEXT: s_waitcnt vmcnt(31)
; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(29)
; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5
; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7
@@ -12819,82 +12802,81 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11
; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(27)
; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(25)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(23)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(21)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(16)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(14)
; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52
; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(8)
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
@@ -12902,47 +12884,97 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0xc
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251
; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
-; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(44)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
+; ALIGNED-NEXT: s_waitcnt vmcnt(43)
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(42)
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: s_waitcnt vmcnt(41)
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(40)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(33)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: s_waitcnt vmcnt(32)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(16)
+; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20
+; ALIGNED-NEXT: s_waitcnt vmcnt(12)
+; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12
+; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
@@ -13214,289 +13246,158 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x3e
-; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223
-; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210
-; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226
-; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227
-; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230
-; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234
-; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240
-; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241
-; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244
-; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245
-; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246
-; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247
-; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: s_clause 0x5
-; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113
; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118
-; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123
; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3
-; ALIGNED-NEXT: s_waitcnt vmcnt(59)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104
; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100
-; ALIGNED-NEXT: s_waitcnt vmcnt(13)
-; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20
-; ALIGNED-NEXT: s_waitcnt vmcnt(9)
-; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120
; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98
; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107
; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111
; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90
; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84
; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92
; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75
; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79
; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54
; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61
; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74
; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49
; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34
; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56
; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28
; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57
; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24
; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42
; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4
; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95
; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10
; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95
; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1
-; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3
; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
; ALIGNED-NEXT: v_lshl_or_b32 v95, v5, 8, v125
; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10
; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
@@ -13509,19 +13410,19 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18
; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16
; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232
; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236
; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224
-; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704
; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708
-; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
@@ -13590,6 +13491,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220
; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212
; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202
; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203
; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201
@@ -13606,8 +13509,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198
; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196
; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800
@@ -13622,6 +13523,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186
; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187
; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185
@@ -13638,7 +13540,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182
; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180
; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
@@ -13650,6 +13551,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170
; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171
; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169
@@ -13666,7 +13568,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166
; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164
; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
@@ -13678,11 +13579,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154
; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155
; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153
; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
@@ -14200,9 +14101,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10
; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
@@ -14253,23 +14154,23 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_mov_b32 s7, -1
; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: s_clause 0x39
+; ALIGNED-NEXT: s_clause 0x3e
; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20
; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21
; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22
; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23
-; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26
; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19
; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29
; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30
; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33
; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35
; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36
; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37
@@ -14291,17 +14192,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53
; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54
; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55
-; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56
-; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57
-; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58
; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60
; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62
; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64
; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67
; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68
; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69
@@ -14309,57 +14210,97 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71
; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76
; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75
; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78
; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
+; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: s_clause 0x34
+; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(56)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(55)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(54)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(53)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(52)
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(51)
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(48)
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(47)
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(46)
; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(45)
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8
-; ALIGNED-NEXT: s_waitcnt vmcnt(40)
; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6
; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
-; ALIGNED-NEXT: s_waitcnt vmcnt(39)
; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
-; ALIGNED-NEXT: s_waitcnt vmcnt(37)
; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
-; ALIGNED-NEXT: s_waitcnt vmcnt(35)
; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
-; ALIGNED-NEXT: s_waitcnt vmcnt(33)
; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25
-; ALIGNED-NEXT: s_waitcnt vmcnt(31)
; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(29)
; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5
@@ -14368,75 +14309,88 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11
; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(27)
; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(25)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(23)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(21)
; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(16)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(14)
; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52
; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87
; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(8)
; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86
; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: s_clause 0x5
; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210
; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
@@ -14445,52 +14399,83 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247
; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87
-; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(51)
+; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(50)
+; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(49)
+; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(48)
+; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(47)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(46)
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(44)
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: s_waitcnt vmcnt(43)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(41)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: s_waitcnt vmcnt(35)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2
; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
@@ -14763,8 +14748,15 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2
; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158
@@ -14772,250 +14764,110 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x3e
-; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223
-; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210
-; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224
-; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225
-; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226
-; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227
-; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228
-; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229
-; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230
-; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235
-; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234
-; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240
-; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241
-; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242
-; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243
-; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244
-; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245
-; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246
-; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247
-; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: s_clause 0x6
-; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250
-; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen
-; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2
-; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6
-; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101
; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113
; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123
; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2
-; ALIGNED-NEXT: s_waitcnt vmcnt(60)
; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97
-; ALIGNED-NEXT: s_waitcnt vmcnt(58)
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93
; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96
-; ALIGNED-NEXT: s_waitcnt vmcnt(14)
-; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
-; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108
; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84
; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94
; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107
; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78
; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71
; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89
; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63
; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74
; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50
; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47
; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61
; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58
; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36
; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60
; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42
; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44
; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24
; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117
; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119
; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250
; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen
; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11
; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 16, v62
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1
; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1
; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
@@ -15027,48 +14879,47 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62
; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104
; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15
; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76
-; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8
; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9
; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122
+; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62
+; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122
; ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18
; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16
; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492
; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484
; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480
-; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704
; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708
-; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
@@ -15137,6 +14988,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476
; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468
; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202
; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203
; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201
@@ -15153,8 +15006,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198
; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196
; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192
-; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
@@ -15169,6 +15020,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186
; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187
; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185
@@ -15185,7 +15037,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182
; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:180
; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
@@ -15197,6 +15048,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170
; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171
; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169
@@ -15213,7 +15065,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166
; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164
; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
@@ -15225,11 +15076,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154
; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155
; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153
; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
@@ -15747,11 +15598,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10
; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11
; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13
; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
@@ -15788,7 +15639,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4
; ALIGNED-NEXT: .LBB9_5: ; %Flow11
; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; ALIGNED-NEXT: s_clause 0x2f
+; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
index 71900a4..3280048 100644
--- a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
@@ -90,19 +90,19 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh
; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40
; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null
; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null
; GFX12-GISEL-NEXT: buffer_load_b32 v4, off, s[40:43], null
+; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2
; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1
-; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x1
; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, 0xac0, v3
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1
+; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 78207c2..1177474 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -185,44 +185,47 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_f32_e32 v4, s43, v4
; GFX900-NEXT: v_add_f32_e32 v3, s42, v3
; GFX900-NEXT: v_add_f32_e32 v2, s41, v2
; GFX900-NEXT: v_add_f32_e32 v1, s40, v1
-; GFX900-NEXT: s_waitcnt vmcnt(6)
-; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
-; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
-; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v32, s19, v32
; GFX900-NEXT: v_add_f32_e32 v31, s18, v31
; GFX900-NEXT: v_add_f32_e32 v30, s17, v30
; GFX900-NEXT: v_add_f32_e32 v29, s16, v29
+; GFX900-NEXT: s_waitcnt vmcnt(4)
+; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
+; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
+; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
; GFX900-NEXT: v_add_f32_e32 v5, s36, v5
+; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_add_f32_e32 v12, s51, v12
; GFX900-NEXT: v_add_f32_e32 v11, s50, v11
; GFX900-NEXT: v_add_f32_e32 v10, s49, v10
; GFX900-NEXT: v_add_f32_e32 v9, s48, v9
+; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_add_f32_e32 v16, s47, v16
; GFX900-NEXT: v_add_f32_e32 v15, s46, v15
; GFX900-NEXT: v_add_f32_e32 v14, s45, v14
; GFX900-NEXT: v_add_f32_e32 v13, s44, v13
+; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_add_f32_e32 v20, s15, v20
; GFX900-NEXT: v_add_f32_e32 v19, s14, v19
; GFX900-NEXT: v_add_f32_e32 v18, s13, v18
; GFX900-NEXT: v_add_f32_e32 v17, s12, v17
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_f32_e32 v24, s11, v24
; GFX900-NEXT: v_add_f32_e32 v23, s10, v23
; GFX900-NEXT: v_add_f32_e32 v22, s9, v22
@@ -246,6 +249,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
@@ -255,9 +260,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41]
; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
@@ -293,6 +296,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@@ -302,9 +307,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37]
; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
@@ -340,11 +343,14 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fadd_v32_vs:
; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_clause 0x2
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40
; GFX1250-SDAG-NEXT: s_clause 0x7
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
@@ -354,22 +360,18 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
@@ -409,6 +411,9 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
@@ -421,10 +426,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-GISEL-NEXT: s_clause 0x1
-; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
@@ -1442,44 +1443,47 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4
; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3
; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2
; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1
-; GFX900-NEXT: s_waitcnt vmcnt(6)
-; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
-; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
-; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32
; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31
; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30
; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29
+; GFX900-NEXT: s_waitcnt vmcnt(4)
+; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
+; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
+; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5
+; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12
; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11
; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10
; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9
+; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16
; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15
; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14
; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13
+; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20
; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19
; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18
; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24
; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23
; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22
@@ -1503,6 +1507,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
@@ -1512,9 +1518,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41]
; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
@@ -1550,6 +1554,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@@ -1559,9 +1565,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37]
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
@@ -1597,11 +1601,14 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fmul_v32_vs:
; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_clause 0x2
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40
; GFX1250-SDAG-NEXT: s_clause 0x7
; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16
; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48
@@ -1611,22 +1618,18 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
@@ -1666,6 +1669,9 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
@@ -1678,10 +1684,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-GISEL-NEXT: s_clause 0x1
-; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
@@ -2273,44 +2275,47 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43
; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42
; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41
; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40
-; GFX900-NEXT: s_waitcnt vmcnt(6)
-; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
-; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
-; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19
; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18
; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17
; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16
+; GFX900-NEXT: s_waitcnt vmcnt(4)
+; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
+; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
+; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36
+; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51
; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50
; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49
; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48
+; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47
; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46
; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45
; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44
+; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15
; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14
; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13
; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11
; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10
; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9
@@ -2334,6 +2339,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
@@ -2343,9 +2350,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7)
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41]
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43]
; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
@@ -2381,6 +2386,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@@ -2390,9 +2397,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7)
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[36:37], s[36:37]
; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
@@ -2430,6 +2435,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
@@ -2442,10 +2450,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96
; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64
; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41]
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43]
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51]
@@ -2496,6 +2500,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
@@ -2508,10 +2515,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80
; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96
; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112
-; GFX1250-GISEL-NEXT: s_clause 0x1
-; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4
-; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
index d0d5cc1..025d9e6 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
@@ -56,11 +56,11 @@ body: |
; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
; GCN-NEXT: }
- ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (load (s32)) {
; GCN-NEXT: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: }
- ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
+ ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (store (s128)) {
; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128))
; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128))
; GCN-NEXT: }
@@ -359,6 +359,7 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABLE: name: no_sched_barrier_within_bundle
+ ; GCN-LABEL: name: no_sched_barrier_within_bundle
; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir
index 5fea0ae..e0266b9 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir
@@ -9,7 +9,7 @@ body: |
; GFX12-LABEL: name: post_bundle_vimage
; GFX12: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) {
; GFX12-NEXT: $vgpr5 = IMAGE_LOAD_V1_V1_gfx12 $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
; GFX12-NEXT: $vgpr4 = IMAGE_LOAD_V1_V1_gfx12 killed $vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
; GFX12-NEXT: }
@@ -25,7 +25,7 @@ body: |
; GFX12-LABEL: name: post_bundle_vsample
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11
; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 {
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 :: (dereferenceable load (s128), addrspace 8) {
; GFX12-NEXT: $vgpr6_vgpr7_vgpr8_vgpr9 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr0, killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; GFX12-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr2, killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; GFX12-NEXT: }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 85a9aba..b91bdd2 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -398,11 +398,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5]
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2
; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2
; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5]
@@ -514,10 +514,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: ; => This Inner Loop Header: Depth=2
; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2
; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc
-; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096
-; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048
-; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
+; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off
@@ -526,13 +524,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2
; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
+; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096
+; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048
; GFX900-NEXT: s_addk_i32 s5, 0x2000
; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff
-; GFX900-NEXT: s_waitcnt vmcnt(3)
+; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096
-; GFX900-NEXT: s_waitcnt vmcnt(3)
+; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22
; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1]
; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
@@ -540,13 +540,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048
-; GFX900-NEXT: s_waitcnt vmcnt(5)
+; GFX900-NEXT: s_waitcnt vmcnt(7)
; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24
; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off
; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX900-NEXT: s_waitcnt vmcnt(5)
+; GFX900-NEXT: s_waitcnt vmcnt(7)
; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20
; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc
; GFX900-NEXT: s_waitcnt vmcnt(4)
@@ -734,10 +734,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc
-; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
-; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
-; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off
+; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off
@@ -753,39 +751,42 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc
; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048
; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off
+; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
+; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX90A-NEXT: s_addk_i32 s3, 0x2000
; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff
-; GFX90A-NEXT: s_waitcnt vmcnt(8)
+; GFX90A-NEXT: s_waitcnt vmcnt(10)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(7)
+; GFX90A-NEXT: s_waitcnt vmcnt(9)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(6)
+; GFX90A-NEXT: s_waitcnt vmcnt(8)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(5)
+; GFX90A-NEXT: s_waitcnt vmcnt(7)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(4)
+; GFX90A-NEXT: s_waitcnt vmcnt(6)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(3)
+; GFX90A-NEXT: s_waitcnt vmcnt(5)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(2)
+; GFX90A-NEXT: s_waitcnt vmcnt(4)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: s_waitcnt vmcnt(3)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll
index 118c47e..cac1fe9 100644
--- a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll
@@ -46,7 +46,7 @@ define amdgpu_ps void @_amdgpu_ps_main(float %arg) {
; GFX900-NEXT: s_mov_b64 exec, 0
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
; GFX900-NEXT: .LBB0_5: ; %bb6
; GFX900-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -75,7 +75,7 @@ bb5:
bb6:
%i7 = phi float [ 0.000000e+00, %bb5 ], [ %i3, %bb1 ]
%i8 = phi float [ 0.000000e+00, %bb5 ], [ 1.000000e+00, %bb1 ]
- %i9 = phi float [ undef, %bb5 ], [ %i4, %bb1 ]
+ %i9 = phi float [ poison, %bb5 ], [ %i4, %bb1 ]
%i10 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %i7)
%i11 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %i8, float %i9)
call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> %i10, <2 x half> %i11, i1 false, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 7a3bff8..840916a 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -28,15 +28,20 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -44,26 +49,19 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -76,19 +74,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -97,8 +84,22 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -106,17 +107,16 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -153,37 +153,35 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -193,19 +191,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -213,24 +200,37 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -266,36 +266,33 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -305,26 +302,30 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -333,16 +334,15 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -392,7 +392,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -435,6 +434,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -448,8 +448,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -457,6 +455,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -503,7 +503,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -546,6 +545,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -559,8 +559,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -568,6 +566,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -974,42 +974,43 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -1024,8 +1025,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -1051,15 +1051,20 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -1067,26 +1072,19 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -1099,19 +1097,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -1120,8 +1107,22 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -1129,17 +1130,16 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1176,37 +1176,35 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1216,19 +1214,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -1236,24 +1223,37 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1289,36 +1289,33 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1328,26 +1325,30 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -1356,16 +1357,15 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -1415,7 +1415,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1458,6 +1457,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -1471,8 +1471,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1480,6 +1478,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1526,7 +1526,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1569,6 +1568,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -1582,8 +1582,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1591,6 +1589,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1997,42 +1997,43 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -2047,8 +2048,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -2074,15 +2074,20 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -2090,26 +2095,19 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -2122,19 +2120,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -2143,8 +2130,22 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -2152,17 +2153,16 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2199,37 +2199,35 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2239,19 +2237,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -2259,24 +2246,37 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2312,36 +2312,33 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2351,26 +2348,30 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -2379,16 +2380,15 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -2438,7 +2438,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2481,6 +2480,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -2494,8 +2494,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2503,6 +2501,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -2549,7 +2549,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2592,6 +2591,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -2605,8 +2605,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2614,6 +2612,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -3020,42 +3020,43 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -3070,8 +3071,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -3097,15 +3097,20 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -3113,26 +3118,19 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -3145,19 +3143,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -3166,8 +3153,22 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -3175,17 +3176,16 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -3222,37 +3222,35 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -3262,19 +3260,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -3282,24 +3269,37 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -3334,36 +3334,33 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -3373,26 +3370,30 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -3401,16 +3402,15 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -3459,7 +3459,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -3502,6 +3501,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -3515,8 +3515,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -3524,6 +3522,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -3569,7 +3569,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -3612,6 +3611,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -3625,8 +3625,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -3634,6 +3632,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -4040,42 +4040,43 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -4090,8 +4091,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -4117,15 +4117,20 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: s_mov_b32 s7, 0xe8f000
; SI-NEXT: s_add_u32 s4, s4, s0
; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -4133,26 +4138,19 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -4165,19 +4163,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -4186,8 +4173,22 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -4195,17 +4196,16 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -4242,37 +4242,35 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; VI-NEXT: s_mov_b32 s7, 0xe80000
; VI-NEXT: s_add_u32 s4, s4, s0
; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -4282,19 +4280,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -4302,24 +4289,37 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -4354,36 +4354,33 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -4393,26 +4390,30 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -4421,16 +4422,15 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -4479,7 +4479,6 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -4522,6 +4521,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -4535,8 +4535,6 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -4544,6 +4542,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -4589,7 +4589,6 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -4632,6 +4631,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -4645,8 +4645,6 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -4654,6 +4652,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -5060,42 +5060,43 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -5110,8 +5111,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -5141,15 +5141,20 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: s_mov_b32 s11, 0xe8f000
; SI-NEXT: s_add_u32 s8, s8, s6
; SI-NEXT: s_addc_u32 s9, s9, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
@@ -5157,26 +5162,19 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -5189,19 +5187,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
@@ -5210,8 +5197,22 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
@@ -5219,17 +5220,16 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -5267,37 +5267,35 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: s_mov_b32 s11, 0xe80000
; VI-NEXT: s_add_u32 s8, s8, s6
; VI-NEXT: s_addc_u32 s9, s9, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -5307,19 +5305,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
@@ -5327,24 +5314,37 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -5380,36 +5380,33 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -5419,26 +5416,30 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -5447,16 +5448,15 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -5491,10 +5491,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -5505,8 +5505,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -5549,6 +5547,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -5562,8 +5562,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -5571,6 +5569,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -5602,10 +5602,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -5616,8 +5616,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -5660,6 +5658,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -5673,8 +5673,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -5682,6 +5680,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -6093,10 +6093,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
@@ -6105,29 +6105,31 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -6142,8 +6144,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -6172,15 +6173,20 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: s_mov_b32 s11, 0xe8f000
; SI-NEXT: s_add_u32 s8, s8, s6
; SI-NEXT: s_addc_u32 s9, s9, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
@@ -6188,26 +6194,19 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
@@ -6220,19 +6219,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
@@ -6241,8 +6229,22 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
@@ -6250,17 +6252,16 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6298,37 +6299,35 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: s_mov_b32 s11, 0xe80000
; VI-NEXT: s_add_u32 s8, s8, s6
; VI-NEXT: s_addc_u32 s9, s9, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -6338,19 +6337,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
@@ -6358,24 +6346,37 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6411,36 +6412,33 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -6450,26 +6448,30 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX9-MUBUF-NEXT: s_nop 0
@@ -6478,16 +6480,15 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6522,10 +6523,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -6536,8 +6537,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -6580,6 +6579,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -6593,8 +6594,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -6602,6 +6601,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -6633,10 +6634,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -6647,8 +6648,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -6691,6 +6690,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -6704,8 +6705,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -6713,6 +6712,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -7124,10 +7125,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
@@ -7136,29 +7137,31 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -7173,8 +7176,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
index 71e4755..c90d788 100644
--- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
+++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
@@ -3,9 +3,6 @@
define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) {
; CHECK-LABEL: excess_soft_clause_reg_pressure:
; CHECK: BB0_1: ; %for.cond28.preheader
-; CHECK: s_load_dwordx16
-; CHECK-NEXT: s_load_dwordx16
-
; CHECK: global_load_dword
; CHECK-NEXT: global_load_dword
; CHECK-NEXT: global_load_dword
@@ -18,11 +15,23 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspa
; CHECK-NOT: v_readlane_b32
; CHECK: s_load_dwordx16
+; CHECK-NEXT: s_load_dwordx16
+
+; CHECK-NOT: v_writelane_b32
+; CHECK-NOT: v_readlane_b32
+
; CHECK: s_load_dwordx16
+; CHECK-NEXT: s_load_dwordx16
+
+; CHECK-NOT: v_writelane_b32
+; CHECK-NOT: v_readlane_b32
+
; CHECK: s_load_dwordx16
+; CHECK-NEXT: s_load_dwordx16
; CHECK-NOT: v_writelane_b32
; CHECK-NOT: v_readlane_b32
+
entry:
%i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%i2 = load i64, ptr addrspace(4) %i, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index da48af1..1a0f75e 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -448,13 +448,13 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3]
-; GFX90A-NEXT: s_nop 4
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
; GFX90A-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 50056b6..b5474b8 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10314,7 +10314,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
@@ -10327,12 +10328,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112
@@ -10344,7 +10343,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0
@@ -10358,10 +10359,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39]
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1
@@ -10468,13 +10466,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3)
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:160
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 9cb22da..802de80 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -295,9 +295,9 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
-; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_writelane_b32 v40, s34, 3
; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index d80ec6b..8f8e2c0 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -655,7 +655,7 @@ bb:
br label %bb5
bb5: ; preds = %bb5.backedge, %bb
- %tmp4.i.sroa.0.0 = phi <9 x double> [ undef, %bb ], [ %tmp4.i.sroa.0.1, %bb5.backedge ]
+ %tmp4.i.sroa.0.0 = phi <9 x double> [ poison, %bb ], [ %tmp4.i.sroa.0.1, %bb5.backedge ]
%tmp14.1.i = load i32, ptr inttoptr (i64 128 to ptr), align 128
store i32 0, ptr addrspace(5) null, align 4
%tmp14.2.i = load i32, ptr inttoptr (i64 128 to ptr), align 128
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
new file mode 100644
index 0000000..696962a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GCN %s
+
+# Exercise very basic handling of BUNDLE'd instructions by the two-address-instruction pass.
+
+# This test is an example where it is best to keep the two-address instruction
+# and resolve the tie with a COPY that is expected to be coalesced.
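+# In the checks below, the pass is expected to materialize a COPY of the add
+# result and rewrite the bundle's tied operands to use that COPY, keeping
+# V_FMAC_F32_e32 in its two-address form.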
+---
+name: test_fmac_bundle
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: test_fmac_bundle
+ ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+ ; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit [[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec {
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]], killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec
+ ; GCN-NEXT: }
+ %10:vgpr_32 = COPY $vgpr0
+ %11:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed %2(tied-def 0), implicit $mode, implicit $exec {
+ %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit $mode, implicit $exec
+ }
+
+...
+
+# This test is an example where conversion to three-address form would be beneficial.
+---
+name: test_fmac_reuse_bundle
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: test_fmac_reuse_bundle
+ ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GCN-NEXT: BUNDLE implicit-def [[COPY1]], implicit [[DEF]], implicit [[DEF1]], implicit [[COPY1]](tied-def 0), implicit $mode, implicit $exec {
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]], killed [[DEF1]], killed [[COPY1]], implicit $mode, implicit $exec
+ ; GCN-NEXT: }
+ ; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], [[COPY]], 0, implicit $exec
+ %2:vgpr_32 = COPY $vgpr0
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit %2(tied-def 0), implicit $mode, implicit $exec {
+ %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit $mode, implicit $exec
+ }
+ %4:vgpr_32 = V_ADD_U32_e64 %3, %2, 0, implicit $exec
+
+...
diff --git a/llvm/test/CodeGen/DirectX/llvm_assume.ll b/llvm/test/CodeGen/DirectX/llvm_assume.ll
new file mode 100644
index 0000000..d739592
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/llvm_assume.ll
@@ -0,0 +1,9 @@
+; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+
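+; The checks below expect dxil-intrinsic-expansion to drop the llvm.assume
+; call entirely, leaving only the bare return.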
+define void @test_llvm_assume(i1 %0) {
+; CHECK-LABEL: test_llvm_assume
+; CHECK-NEXT: ret void
+  tail call void @llvm.assume(i1 %0)
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/DirectX/scalarize-alloca.ll b/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
index a8557e4..475935d 100644
--- a/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
+++ b/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
@@ -42,3 +42,68 @@ define void @alloca_2d_gep_test() {
%3 = getelementptr inbounds nuw [2 x <2 x i32>], ptr %1, i32 0, i32 %2
ret void
}
+
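+; The tests below index into the alloca through GEPs whose source element
+; type is a subtype of the allocated type (array, vector, scalar, i8); the
+; scalarization and flattening passes are expected to rewrite the GEPs as
+; the SCHECK/FCHECK lines show.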
+; CHECK-LABEL: subtype_array_test
+define void @subtype_array_test() {
+ ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4
+ ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4
+ ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
+ ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr [[alloca_val]], i32 0, i32 [[tid]]
+ ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4
+ ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]]
+ ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]]
+ ; CHECK: ret void
+ %arr = alloca [8 x [4 x i32]], align 4
+ %i = tail call i32 @llvm.dx.thread.id(i32 0)
+ %gep = getelementptr inbounds nuw [4 x i32], ptr %arr, i32 %i
+ ret void
+}
+
+; CHECK-LABEL: subtype_vector_test
+define void @subtype_vector_test() {
+ ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4
+ ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4
+ ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
+ ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr [[alloca_val]], i32 0, i32 [[tid]]
+ ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4
+ ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]]
+ ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]]
+ ; CHECK: ret void
+ %arr = alloca [8 x <4 x i32>], align 4
+ %i = tail call i32 @llvm.dx.thread.id(i32 0)
+ %gep = getelementptr inbounds nuw <4 x i32>, ptr %arr, i32 %i
+ ret void
+}
+
+; CHECK-LABEL: subtype_scalar_test
+define void @subtype_scalar_test() {
+ ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4
+ ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4
+ ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
+ ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr [[alloca_val]], i32 0, i32 0, i32 [[tid]]
+ ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1
+ ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]]
+ ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]]
+ ; CHECK: ret void
+ %arr = alloca [8 x [4 x i32]], align 4
+ %i = tail call i32 @llvm.dx.thread.id(i32 0)
+ %gep = getelementptr inbounds nuw i32, ptr %arr, i32 %i
+ ret void
+}
+
+; CHECK-LABEL: subtype_i8_test
+define void @subtype_i8_test() {
+ ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4
+ ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4
+ ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
+ ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw i8, ptr [[alloca_val]], i32 [[tid]]
+ ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1
+ ; FCHECK: [[flatidx_lshr:%.*]] = lshr i32 [[flatidx_mul]], 2
+ ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_lshr]]
+ ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]]
+ ; CHECK: ret void
+ %arr = alloca [8 x [4 x i32]], align 4
+ %i = tail call i32 @llvm.dx.thread.id(i32 0)
+ %gep = getelementptr inbounds nuw i8, ptr %arr, i32 %i
+ ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/scalarize-global.ll b/llvm/test/CodeGen/DirectX/scalarize-global.ll
new file mode 100644
index 0000000..ca10f6e
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/scalarize-global.ll
@@ -0,0 +1,70 @@
+; RUN: opt -S -passes='dxil-data-scalarization' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=SCHECK,CHECK
+; RUN: opt -S -passes='dxil-data-scalarization,dxil-flatten-arrays' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=FCHECK,CHECK
+
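+; SCHECK lines apply after data scalarization alone, FCHECK lines after
+; scalarization plus array flattening, and CHECK lines are common to both runs.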
+@"arrayofVecData" = local_unnamed_addr addrspace(3) global [8 x <4 x i32>] zeroinitializer, align 16
+@"vecData" = external addrspace(3) global <4 x i32>, align 4
+
+; SCHECK: [[arrayofVecData:@arrayofVecData.*]] = local_unnamed_addr addrspace(3) global [8 x [4 x i32]] zeroinitializer, align 16
+; FCHECK: [[arrayofVecData:@arrayofVecData.*]] = local_unnamed_addr addrspace(3) global [32 x i32] zeroinitializer, align 16
+; CHECK: [[vecData:@vecData.*]] = external addrspace(3) global [4 x i32], align 4
+
+; CHECK-LABEL: subtype_array_test
+define <4 x i32> @subtype_array_test() {
+ ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
+ ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[tid]]
+ ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4
+ ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]]
+ ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]]
+ ; CHECK: [[x:%.*]] = load <4 x i32>, ptr addrspace(3) [[gep]], align 4
+ ; CHECK: ret <4 x i32> [[x]]
+ %i = tail call i32 @llvm.dx.thread.id(i32 0)
+ %gep = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @"arrayofVecData", i32 %i
+ %x = load <4 x i32>, ptr addrspace(3) %gep, align 4
+ ret <4 x i32> %x
+}
+
+; CHECK-LABEL: subtype_vector_test
+define <4 x i32> @subtype_vector_test() {
+ ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
+ ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[tid]]
+ ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4
+ ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]]
+ ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]]
+ ; CHECK: [[x:%.*]] = load <4 x i32>, ptr addrspace(3) [[gep]], align 4
+ ; CHECK: ret <4 x i32> [[x]]
+ %i = tail call i32 @llvm.dx.thread.id(i32 0)
+ %gep = getelementptr inbounds nuw <4 x i32>, ptr addrspace(3) @"arrayofVecData", i32 %i
+ %x = load <4 x i32>, ptr addrspace(3) %gep, align 4
+ ret <4 x i32> %x
+}
+
+; CHECK-LABEL: subtype_scalar_test
+define <4 x i32> @subtype_scalar_test() {
+ ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
+ ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 0, i32 [[tid]]
+ ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1
+ ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]]
+ ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]]
+ ; CHECK: [[x:%.*]] = load <4 x i32>, ptr addrspace(3) [[gep]], align 4
+ ; CHECK: ret <4 x i32> [[x]]
+ %i = tail call i32 @llvm.dx.thread.id(i32 0)
+ %gep = getelementptr inbounds nuw i32, ptr addrspace(3) @"arrayofVecData", i32 %i
+ %x = load <4 x i32>, ptr addrspace(3) %gep, align 4
+ ret <4 x i32> %x
+}
+
+; CHECK-LABEL: subtype_i8_test
+define <4 x i32> @subtype_i8_test() {
+ ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
+ ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[arrayofVecData]], i32 [[tid]]
+ ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1
+ ; FCHECK: [[flatidx_lshr:%.*]] = lshr i32 [[flatidx_mul]], 2
+ ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_lshr]]
+ ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]]
+ ; CHECK: [[x:%.*]] = load <4 x i32>, ptr addrspace(3) [[gep]], align 4
+ ; CHECK: ret <4 x i32> [[x]]
+ %i = tail call i32 @llvm.dx.thread.id(i32 0)
+ %gep = getelementptr inbounds nuw i8, ptr addrspace(3) @"arrayofVecData", i32 %i
+ %x = load <4 x i32>, ptr addrspace(3) %gep, align 4
+ ret <4 x i32> %x
+}
diff --git a/llvm/test/CodeGen/Generic/reloc-none.ll b/llvm/test/CodeGen/Generic/reloc-none.ll
new file mode 100644
index 0000000..0c8b7a5
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/reloc-none.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s | FileCheck %s
+
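+; A call to llvm.reloc.none with a metadata symbol name should emit a .reloc
+; directive using BFD_RELOC_NONE against that symbol.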
+; CHECK: .reloc {{.*}}, BFD_RELOC_NONE, foo
+
+define void @test_reloc_none() {
+ call void @llvm.reloc.none(metadata !"foo")
+ ret void
+}
+
+declare void @llvm.reloc.none(metadata)
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/xqf-fixup-qfp1.ll b/llvm/test/CodeGen/Hexagon/autohvx/xqf-fixup-qfp1.ll
new file mode 100644
index 0000000..9625a605
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/xqf-fixup-qfp1.ll
@@ -0,0 +1,372 @@
+; REQUIRES: hexagon-registered-target, silver
+; This test checks correct handling of register spills and fills of
+; qf operands during register allocation.
+
+; RUN: llc -mcpu=hexagonv79 -mattr=+hvx-length128b,+hvxv79,+hvx-ieee-fp,+hvx-qfloat,-long-calls -debug-only=handle-qfp %s 2>&1 -o - | FileCheck %s --check-prefixes V79-81,V79
+; RUN: llc -mcpu=hexagonv81 -mattr=+hvx-length128b,+hvxv81,+hvx-ieee-fp,+hvx-qfloat,-long-calls -debug-only=handle-qfp %s 2>&1 -o - | FileCheck %s --check-prefixes V79-81,V81
+
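+; The V79 and V81 prefixes differ in how a stray qf32 value appears to be
+; normalized: on v79 the pass adds a zero vector (V6_vd0 + V6_vadd_sf),
+; while on v81 it can use V6_vconv_qf32_sf directly.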
+; V79-81: Finding uses of: renamable $w{{[0-9]+}} = V6_vmpy_qf32_hf
+; V79-81: Inserting after conv: [[VREG0:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG0]]
+; V79-81-NEXT: Inserting after conv: [[VREG1:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG1]]
+; V79-81: Finding uses of: renamable $w{{[0-9]+}} = V6_vmpy_qf32_hf
+; V79-81: Inserting after conv: [[VREG2:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG2]]
+; V79-81-NEXT: Inserting after conv: [[VREG3:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG3]]
+; V79-81: Finding uses of: renamable $w{{[0-9]+}} = V6_vmpy_qf32_hf
+; V79-81-DAG: Inserting after conv: [[VREG4:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG4]]
+; V79-81-DAG: Inserting after conv: [[VREG5:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG5]]
+; V79-81-DAG: Inserting new instruction: $v{{[0-9]+}} = V6_vadd_sf killed renamable [[VREG2]], killed renamable [[VREG0]]
+; V79-81-DAG: Inserting new instruction: $v{{[0-9]+}} = V6_vsub_sf killed renamable $v{{[0-9]+}}, killed renamable $v{{[0-9]+}}
+;
+; V79-81: Analyzing convert instruction: renamable [[VREG6:\$v[0-9]+]] = V6_vconv_hf_qf32 killed renamable $w{{[0-9]+}}
+; V79: Inserting new instruction: [[VREG30:\$v[0-9]+]] = V6_vd0
+; V79-NEXT: Inserting new instruction: [[VREG7:\$v[0-9]+]] = V6_vadd_sf killed renamable [[VREG7]], killed [[VREG30]]
+; V79: Inserting new instruction: [[VREG30]] = V6_vd0
+; V79-NEXT: Inserting new instruction: [[VREG8:\$v[0-9]+]] = V6_vadd_sf killed renamable [[VREG8]], killed [[VREG30]]
+; V81: Inserting new instruction: [[VREG7:\$v[0-9]+]] = V6_vconv_qf32_sf killed renamable [[VREG7]]
+; V81: Inserting new instruction: [[VREG8:\$v[0-9]+]] = V6_vconv_qf32_sf killed renamable [[VREG8]]
+
+; V79-81: Analyzing convert instruction: renamable [[VREG9:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable $v{{[0-9]+}}
+; V79: Inserting new instruction: [[VREG30]] = V6_vd0
+; V79-NEXT: Inserting new instruction: [[VREG10:\$v[0-9]+]] = V6_vadd_sf killed renamable [[VREG10]], killed [[VREG30]]
+; V81: Inserting new instruction: [[VREG8:\$v[0-9]+]] = V6_vconv_qf32_sf killed renamable [[VREG8]]
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+@.str.1 = private unnamed_addr constant [9 x i8] c"0x%08lx \00", align 1
+@.str.3 = private unnamed_addr constant [173 x i8] c"/prj/qct/llvm/devops/aether/hexbuild/test_trees/MASTER/test/regress/features/hexagon/arch_v68/hvx_ieee_fp/hvx_ieee_fp_test.c:126 0 && \22ERROR: Failed to acquire HVX unit.\\n\22\00", align 1
+@__func__.main = private unnamed_addr constant [5 x i8] c"main\00", align 1
+@.str.5 = private unnamed_addr constant [33 x i8] c"half -3 converted to vhf = %.2f\0A\00", align 1
+@.str.6 = private unnamed_addr constant [35 x i8] c"uhalf 32k converted to vhf = %.2f\0A\00", align 1
+@.str.7 = private unnamed_addr constant [32 x i8] c"sf 0.5 converted to vhf = %.2f\0A\00", align 1
+@.str.8 = private unnamed_addr constant [32 x i8] c"vhf 4.0 conveted to ubyte = %d\0A\00", align 1
+@.str.9 = private unnamed_addr constant [32 x i8] c"vhf 2.0 conveted to uhalf = %d\0A\00", align 1
+@.str.10 = private unnamed_addr constant [30 x i8] c"byte 4 conveted to hf = %.2f\0A\00", align 1
+@.str.11 = private unnamed_addr constant [31 x i8] c"ubyte 4 conveted to hf = %.2f\0A\00", align 1
+@.str.12 = private unnamed_addr constant [27 x i8] c"hf -3 conveted to sf = %f\0A\00", align 1
+@.str.13 = private unnamed_addr constant [31 x i8] c"vhf 4.0 conveted to byte = %d\0A\00", align 1
+@.str.14 = private unnamed_addr constant [31 x i8] c"vhf 4.0 conveted to half = %d\0A\00", align 1
+@.str.16 = private unnamed_addr constant [33 x i8] c"max of hf 2.0 and hf 4.0 = %.2f\0A\00", align 1
+@.str.17 = private unnamed_addr constant [33 x i8] c"min of hf 2.0 and hf 4.0 = %.2f\0A\00", align 1
+@.str.18 = private unnamed_addr constant [32 x i8] c"max of sf 0.5 and sf 0.25 = %f\0A\00", align 1
+@.str.19 = private unnamed_addr constant [32 x i8] c"min of sf 0.5 and sf 0.25 = %f\0A\00", align 1
+@.str.21 = private unnamed_addr constant [25 x i8] c"negate of hf 4.0 = %.2f\0A\00", align 1
+@.str.22 = private unnamed_addr constant [23 x i8] c"abs of hf -6.0 = %.2f\0A\00", align 1
+@.str.23 = private unnamed_addr constant [23 x i8] c"negate of sf 0.5 = %f\0A\00", align 1
+@.str.24 = private unnamed_addr constant [22 x i8] c"abs of sf -0.25 = %f\0A\00", align 1
+@.str.26 = private unnamed_addr constant [32 x i8] c"hf add of 4.0 and -6.0 = %.2f\0A\00", align 1
+@.str.27 = private unnamed_addr constant [32 x i8] c"hf sub of 4.0 and -6.0 = %.2f\0A\00", align 1
+@.str.28 = private unnamed_addr constant [31 x i8] c"sf add of 0.5 and -0.25 = %f\0A\00", align 1
+@.str.29 = private unnamed_addr constant [31 x i8] c"sf sub of 0.5 and -0.25 = %f\0A\00", align 1
+@.str.30 = private unnamed_addr constant [36 x i8] c"sf add of hf 4.0 and hf -6.0 = %f\0A\00", align 1
+@.str.31 = private unnamed_addr constant [36 x i8] c"sf sub of hf 4.0 and hf -6.0 = %f\0A\00", align 1
+@.str.33 = private unnamed_addr constant [32 x i8] c"hf mpy of 4.0 and -6.0 = %.2f\0A\00", align 1
+@.str.34 = private unnamed_addr constant [35 x i8] c"hf accmpy of 4.0 and -6.0 = %.2f\0A\00", align 1
+@.str.35 = private unnamed_addr constant [36 x i8] c"sf mpy of hf 4.0 and hf -6.0 = %f\0A\00", align 1
+@.str.36 = private unnamed_addr constant [39 x i8] c"sf accmpy of hf 4.0 and hf -6.0 = %f\0A\00", align 1
+@.str.37 = private unnamed_addr constant [31 x i8] c"sf mpy of 0.5 and -0.25 = %f\0A\00", align 1
+@.str.39 = private unnamed_addr constant [25 x i8] c"w copy from sf 0.5 = %f\0A\00", align 1
+@str = private unnamed_addr constant [35 x i8] c"ERROR: Failed to acquire HVX unit.\00", align 1
+@str.40 = private unnamed_addr constant [25 x i8] c"\0AConversion intructions\0A\00", align 1
+@str.41 = private unnamed_addr constant [23 x i8] c"\0AMin/Max instructions\0A\00", align 1
+@str.42 = private unnamed_addr constant [23 x i8] c"\0Aabs/neg instructions\0A\00", align 1
+@str.43 = private unnamed_addr constant [23 x i8] c"\0Aadd/sub instructions\0A\00", align 1
+@str.44 = private unnamed_addr constant [24 x i8] c"\0Amultiply instructions\0A\00", align 1
+@str.45 = private unnamed_addr constant [19 x i8] c"\0Acopy instruction\0A\00", align 1
+
+declare dso_local void @print_vector_words(<32 x i32> noundef %x) local_unnamed_addr #0
+
+; Function Attrs: nofree nounwind optsize
+declare dso_local noundef i32 @printf(ptr nocapture noundef readonly, ...) local_unnamed_addr #0
+
+; Function Attrs: nounwind optsize
+define dso_local i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #1 {
+entry:
+ %call = tail call i32 @acquire_vector_unit(i8 noundef zeroext 0) #6
+ %tobool.not = icmp eq i32 %call, 0
+ br i1 %tobool.not, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %puts = tail call i32 @puts(ptr nonnull dereferenceable(1) @str)
+ tail call void @_Assert(ptr noundef nonnull @.str.3, ptr noundef nonnull @__func__.main) #7
+ unreachable
+
+if.end: ; preds = %entry
+ tail call void @set_double_vector_mode() #6
+ %0 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 16384)
+ %1 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 17408)
+ %2 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 -14848)
+ %3 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608)
+ %4 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1048576000)
+ %5 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 -1098907648)
+ %6 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 -3)
+ %7 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 32768)
+ %puts147 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.40)
+ %8 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.hf.h.128B(<32 x i32> %6)
+ %bc.i = bitcast <32 x i32> %8 to <64 x half>
+ %9 = extractelement <64 x half> %bc.i, i64 0
+ %conv = fpext half %9 to double
+ %call12 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.5, double noundef %conv) #6
+ %10 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.hf.uh.128B(<32 x i32> %7)
+ %bc.i153 = bitcast <32 x i32> %10 to <64 x half>
+ %11 = extractelement <64 x half> %bc.i153, i64 0
+ %conv14 = fpext half %11 to double
+ %call15 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.6, double noundef %conv14) #6
+ %12 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.hf.sf.128B(<32 x i32> %3, <32 x i32> %3)
+ %bc.i155 = bitcast <32 x i32> %12 to <64 x half>
+ %13 = extractelement <64 x half> %bc.i155, i64 0
+ %conv17 = fpext half %13 to double
+ %call18 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.7, double noundef %conv17) #6
+ %14 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.ub.hf.128B(<32 x i32> %1, <32 x i32> %1)
+ %15 = bitcast <32 x i32> %14 to <128 x i8>
+ %conv.i = extractelement <128 x i8> %15, i64 0
+ %conv20 = zext i8 %conv.i to i32
+ %call21 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.8, i32 noundef %conv20) #6
+ %16 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.uh.hf.128B(<32 x i32> %0)
+ %17 = bitcast <32 x i32> %16 to <64 x i16>
+ %conv.i157 = extractelement <64 x i16> %17, i64 0
+ %conv23 = sext i16 %conv.i157 to i32
+ %call24 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.9, i32 noundef %conv23) #6
+ %18 = tail call <64 x i32> @llvm.hexagon.V6.vcvt.hf.b.128B(<32 x i32> %14)
+ %bc.i158 = bitcast <64 x i32> %18 to <128 x half>
+ %19 = extractelement <128 x half> %bc.i158, i64 0
+ %conv26 = fpext half %19 to double
+ %call27 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.10, double noundef %conv26) #6
+ %20 = tail call <64 x i32> @llvm.hexagon.V6.vcvt.hf.ub.128B(<32 x i32> %14)
+ %bc.i159 = bitcast <64 x i32> %20 to <128 x half>
+ %21 = extractelement <128 x half> %bc.i159, i64 0
+ %conv29 = fpext half %21 to double
+ %call30 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.11, double noundef %conv29) #6
+ %22 = tail call <64 x i32> @llvm.hexagon.V6.vcvt.sf.hf.128B(<32 x i32> %8)
+ %bc.i161 = bitcast <64 x i32> %22 to <64 x float>
+ %23 = extractelement <64 x float> %bc.i161, i64 0
+ %conv32 = fpext float %23 to double
+ %call33 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.12, double noundef %conv32) #6
+ %24 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.b.hf.128B(<32 x i32> %1, <32 x i32> %1)
+ %25 = bitcast <32 x i32> %24 to <128 x i8>
+ %conv.i162 = extractelement <128 x i8> %25, i64 0
+ %conv35 = zext i8 %conv.i162 to i32
+ %call36 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.13, i32 noundef %conv35) #6
+ %26 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.h.hf.128B(<32 x i32> %1)
+ %27 = bitcast <32 x i32> %26 to <64 x i16>
+ %conv.i163 = extractelement <64 x i16> %27, i64 0
+ %conv38 = sext i16 %conv.i163 to i32
+ %call39 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.14, i32 noundef %conv38) #6
+ %28 = tail call <32 x i32> @llvm.hexagon.V6.vfmax.hf.128B(<32 x i32> %0, <32 x i32> %1)
+ %puts148 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.41)
+ %bc.i164 = bitcast <32 x i32> %28 to <64 x half>
+ %29 = extractelement <64 x half> %bc.i164, i64 0
+ %conv42 = fpext half %29 to double
+ %call43 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.16, double noundef %conv42) #6
+ %30 = tail call <32 x i32> @llvm.hexagon.V6.vfmin.hf.128B(<32 x i32> %0, <32 x i32> %1)
+ %bc.i166 = bitcast <32 x i32> %30 to <64 x half>
+ %31 = extractelement <64 x half> %bc.i166, i64 0
+ %conv45 = fpext half %31 to double
+ %call46 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.17, double noundef %conv45) #6
+ %32 = tail call <32 x i32> @llvm.hexagon.V6.vfmax.sf.128B(<32 x i32> %3, <32 x i32> %4)
+ %bc.i168 = bitcast <32 x i32> %32 to <32 x float>
+ %33 = extractelement <32 x float> %bc.i168, i64 0
+ %conv48 = fpext float %33 to double
+ %call49 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.18, double noundef %conv48) #6
+ %34 = tail call <32 x i32> @llvm.hexagon.V6.vfmin.sf.128B(<32 x i32> %3, <32 x i32> %4)
+ %bc.i169 = bitcast <32 x i32> %34 to <32 x float>
+ %35 = extractelement <32 x float> %bc.i169, i64 0
+ %conv51 = fpext float %35 to double
+ %call52 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.19, double noundef %conv51) #6
+ %puts149 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.42)
+ %36 = tail call <32 x i32> @llvm.hexagon.V6.vfneg.hf.128B(<32 x i32> %1)
+ %bc.i170 = bitcast <32 x i32> %36 to <64 x half>
+ %37 = extractelement <64 x half> %bc.i170, i64 0
+ %conv55 = fpext half %37 to double
+ %call56 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.21, double noundef %conv55) #6
+ %38 = tail call <32 x i32> @llvm.hexagon.V6.vabs.hf.128B(<32 x i32> %2)
+ %bc.i172 = bitcast <32 x i32> %38 to <64 x half>
+ %39 = extractelement <64 x half> %bc.i172, i64 0
+ %conv58 = fpext half %39 to double
+ %call59 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.22, double noundef %conv58) #6
+ %40 = tail call <32 x i32> @llvm.hexagon.V6.vfneg.sf.128B(<32 x i32> %3)
+ %bc.i174 = bitcast <32 x i32> %40 to <32 x float>
+ %41 = extractelement <32 x float> %bc.i174, i64 0
+ %conv61 = fpext float %41 to double
+ %call62 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.23, double noundef %conv61) #6
+ %42 = tail call <32 x i32> @llvm.hexagon.V6.vabs.sf.128B(<32 x i32> %5)
+ %bc.i175 = bitcast <32 x i32> %42 to <32 x float>
+ %43 = extractelement <32 x float> %bc.i175, i64 0
+ %conv64 = fpext float %43 to double
+ %call65 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.24, double noundef %conv64) #6
+ %puts150 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.43)
+ %44 = tail call <32 x i32> @llvm.hexagon.V6.vadd.hf.hf.128B(<32 x i32> %1, <32 x i32> %2)
+ %bc.i176 = bitcast <32 x i32> %44 to <64 x half>
+ %45 = extractelement <64 x half> %bc.i176, i64 0
+ %conv68 = fpext half %45 to double
+ %call69 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.26, double noundef %conv68) #6
+ %46 = tail call <32 x i32> @llvm.hexagon.V6.vsub.hf.hf.128B(<32 x i32> %1, <32 x i32> %2)
+ %bc.i178 = bitcast <32 x i32> %46 to <64 x half>
+ %47 = extractelement <64 x half> %bc.i178, i64 0
+ %conv71 = fpext half %47 to double
+ %call72 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.27, double noundef %conv71) #6
+ %48 = tail call <32 x i32> @llvm.hexagon.V6.vadd.sf.sf.128B(<32 x i32> %3, <32 x i32> %5)
+ %bc.i180 = bitcast <32 x i32> %48 to <32 x float>
+ %49 = extractelement <32 x float> %bc.i180, i64 0
+ %conv74 = fpext float %49 to double
+ %call75 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.28, double noundef %conv74) #6
+ %50 = tail call <32 x i32> @llvm.hexagon.V6.vsub.sf.sf.128B(<32 x i32> %3, <32 x i32> %5)
+ %bc.i181 = bitcast <32 x i32> %50 to <32 x float>
+ %51 = extractelement <32 x float> %bc.i181, i64 0
+ %conv77 = fpext float %51 to double
+ %call78 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.29, double noundef %conv77) #6
+ %52 = tail call <64 x i32> @llvm.hexagon.V6.vadd.sf.hf.128B(<32 x i32> %1, <32 x i32> %2)
+ %bc.i182 = bitcast <64 x i32> %52 to <64 x float>
+ %53 = extractelement <64 x float> %bc.i182, i64 0
+ %conv80 = fpext float %53 to double
+ %call81 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.30, double noundef %conv80) #6
+ %54 = tail call <64 x i32> @llvm.hexagon.V6.vsub.sf.hf.128B(<32 x i32> %1, <32 x i32> %2)
+ %bc.i183 = bitcast <64 x i32> %54 to <64 x float>
+ %55 = extractelement <64 x float> %bc.i183, i64 0
+ %conv83 = fpext float %55 to double
+ %call84 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.31, double noundef %conv83) #6
+ %puts151 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.44)
+ %56 = tail call <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.128B(<32 x i32> %1, <32 x i32> %2)
+ %bc.i184 = bitcast <32 x i32> %56 to <64 x half>
+ %57 = extractelement <64 x half> %bc.i184, i64 0
+ %conv87 = fpext half %57 to double
+ %call88 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.33, double noundef %conv87) #6
+ %58 = tail call <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.acc.128B(<32 x i32> %56, <32 x i32> %1, <32 x i32> %2)
+ %bc.i186 = bitcast <32 x i32> %58 to <64 x half>
+ %59 = extractelement <64 x half> %bc.i186, i64 0
+ %conv90 = fpext half %59 to double
+ %call91 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.34, double noundef %conv90) #6
+ %60 = tail call <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.128B(<32 x i32> %1, <32 x i32> %2)
+ %bc.i188 = bitcast <64 x i32> %60 to <64 x float>
+ %61 = extractelement <64 x float> %bc.i188, i64 0
+ %conv93 = fpext float %61 to double
+ %call94 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.35, double noundef %conv93) #6
+ %62 = tail call <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.acc.128B(<64 x i32> %60, <32 x i32> %1, <32 x i32> %2)
+ %bc.i189 = bitcast <64 x i32> %62 to <64 x float>
+ %63 = extractelement <64 x float> %bc.i189, i64 0
+ %conv96 = fpext float %63 to double
+ %call97 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.36, double noundef %conv96) #6
+ %64 = tail call <32 x i32> @llvm.hexagon.V6.vmpy.sf.sf.128B(<32 x i32> %3, <32 x i32> %5)
+ %bc.i190 = bitcast <32 x i32> %64 to <32 x float>
+ %65 = extractelement <32 x float> %bc.i190, i64 0
+ %conv99 = fpext float %65 to double
+ %call100 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.37, double noundef %conv99) #6
+ %puts152 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.45)
+ %66 = tail call <32 x i32> @llvm.hexagon.V6.vassign.fp.128B(<32 x i32> %3)
+ %bc.i191 = bitcast <32 x i32> %66 to <32 x float>
+ %67 = extractelement <32 x float> %bc.i191, i64 0
+ %conv103 = fpext float %67 to double
+ %call104 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.39, double noundef %conv103) #6
+ ret i32 0
+}
+
+; Function Attrs: optsize
+declare dso_local i32 @acquire_vector_unit(i8 noundef zeroext) local_unnamed_addr #2
+
+; Function Attrs: noreturn nounwind optsize
+declare dso_local void @_Assert(ptr noundef, ptr noundef) local_unnamed_addr #3
+
+; Function Attrs: optsize
+declare dso_local void @set_double_vector_mode(...) local_unnamed_addr #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.hf.h.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.hf.uh.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.hf.sf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.ub.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.uh.hf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vcvt.hf.b.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vcvt.hf.ub.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vcvt.sf.hf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.b.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.h.hf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vfmax.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vfmin.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vfmax.sf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vfmin.sf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vfneg.hf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vabs.hf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vfneg.sf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vabs.sf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vadd.hf.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vsub.hf.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vadd.sf.sf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vsub.sf.sf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vadd.sf.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vsub.sf.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.acc.128B(<32 x i32>, <32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.acc.128B(<64 x i32>, <32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vmpy.sf.sf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vassign.fp.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32) #4
+
+; Function Attrs: nofree nounwind
+declare noundef i32 @putchar(i32 noundef) local_unnamed_addr #5
+
+; Function Attrs: nofree nounwind
+declare noundef i32 @puts(ptr nocapture noundef readonly) local_unnamed_addr #5
diff --git a/llvm/test/CodeGen/Hexagon/hvx-vsub-qf-sf-mix.ll b/llvm/test/CodeGen/Hexagon/hvx-vsub-qf-sf-mix.ll
new file mode 100644
index 0000000..cdb779f
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/hvx-vsub-qf-sf-mix.ll
@@ -0,0 +1,60 @@
+;; RUN: llc --mtriple=hexagon --mcpu=hexagonv81 --mattr=+hvxv81,+hvx-length128b %s -o - | FileCheck %s
+
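+;; Each function below multiplies two vectors and subtracts in one order or
+;; the other; the fmul result is expected to stay in qf32/qf16 form, so the
+;; mixed qf/sf (or qf16/hf) variant of vsub should be selected with operands
+;; in IR order.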
+define void @mul_and_sub_1(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) {
+entry:
+ %AVec = load <32 x float>, ptr %A, align 4
+ %BVec = load <32 x float>, ptr %B, align 4
+ %CVec = load <32 x float>, ptr %C, align 4
+ %AtBVec = fmul <32 x float> %AVec, %BVec
+
+ %DVec = fsub <32 x float> %CVec, %AtBVec
+ store <32 x float> %DVec, ptr %D, align 4
+ ret void
+}
+;; CHECK: mul_and_sub_1
+;; CHECK: vsub(v{{[0-9]+}}.sf,v{{[0-9]+}}.qf32)
+
+
+define void @mul_and_sub_2(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) {
+entry:
+ %AVec = load <32 x float>, ptr %A, align 4
+ %BVec = load <32 x float>, ptr %B, align 4
+ %CVec = load <32 x float>, ptr %C, align 4
+ %AtBVec = fmul <32 x float> %AVec, %BVec
+
+ %DVec = fsub <32 x float> %AtBVec, %CVec
+ store <32 x float> %DVec, ptr %D, align 4
+ ret void
+}
+;; CHECK: mul_and_sub_2
+;; CHECK: vsub(v{{[0-9]+}}.qf32,v{{[0-9]+}}.sf)
+
+
+define void @mul_and_sub_3(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) {
+entry:
+ %AVec = load <64 x half>, ptr %A, align 4
+ %BVec = load <64 x half>, ptr %B, align 4
+ %CVec = load <64 x half>, ptr %C, align 4
+ %AtBVec = fmul <64 x half> %AVec, %BVec
+
+ %DVec = fsub <64 x half> %CVec, %AtBVec
+ store <64 x half> %DVec, ptr %D, align 4
+ ret void
+}
+;; CHECK: mul_and_sub_3
+;; CHECK: vsub(v{{[0-9]+}}.hf,v{{[0-9]+}}.qf16)
+
+
+define void @mul_and_sub_4(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) {
+entry:
+ %AVec = load <64 x half>, ptr %A, align 4
+ %BVec = load <64 x half>, ptr %B, align 4
+ %CVec = load <64 x half>, ptr %C, align 4
+ %AtBVec = fmul <64 x half> %AVec, %BVec
+
+ %DVec = fsub <64 x half> %AtBVec, %CVec
+ store <64 x half> %DVec, ptr %D, align 4
+ ret void
+}
+;; CHECK: mul_and_sub_4
+;; CHECK: vsub(v{{[0-9]+}}.qf16,v{{[0-9]+}}.hf)
diff --git a/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll
index c16370c..527f27e 100644
--- a/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll
+++ b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll
@@ -2,7 +2,7 @@
; type as first parameter instead of a sf type without
; any conversion instruction of type sf = qf32
-; RUN: llc -mtriple=hexagon < %s -o - | FileCheck %s
+; RUN: llc -mtriple=hexagon -mattr=+hvx-length128b,+hvxv75,+v75 < %s -o - | FileCheck %s
; CHECK: [[V2:v[0-9]+]] = vxor([[V2]],[[V2]])
; CHECK: [[V0:v[0-9]+]].qf32 = vmpy([[V0]].sf,[[V2]].sf)
@@ -17,5 +17,3 @@ entry:
store <64 x half> %conv17.ripple.vectorized, ptr %out_ptr, align 2
ret void
}
-
-attributes #0 = { "target-features"="+hvx-length128b,+hvxv75,+v75,-long-calls,-small-data" }
diff --git a/llvm/test/CodeGen/Hexagon/vect-qfp.mir b/llvm/test/CodeGen/Hexagon/vect-qfp.mir
new file mode 100644
index 0000000..6909591
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vect-qfp.mir
@@ -0,0 +1,202 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \
+# RUN: -run-pass hexagon-qfp-optimizer -disable-qfp-opt-mul=false %s -o - | FileCheck %s --check-prefix=MUL-ENABLED
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \
+# RUN: -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s --check-prefix=DEFAULT
+# MUL-ENABLED-LABEL: name: qfpAdd32
+# MUL-ENABLED: V6_vconv_sf_qf32
+# MUL-ENABLED-NEXT: V6_vadd_qf32_mix
+# MUL-ENABLED-NEXT: V6_vconv_sf_qf32
+# MUL-ENABLED-NEXT: V6_vS32Ub_ai
+# MUL-ENABLED-NEXT: V6_vadd_qf32
+# DEFAULT-LABEL: name: qfpAdd32
+# DEFAULT: V6_vconv_sf_qf32
+# DEFAULT-NEXT: V6_vadd_qf32_mix
+# DEFAULT-NEXT: V6_vconv_sf_qf32
+# DEFAULT-NEXT: V6_vS32Ub_ai
+# DEFAULT-NEXT: V6_vadd_qf32
+---
+name: qfpAdd32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:intregs = COPY $r3
+ %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %6:hvxvr = V6_vadd_sf %4:hvxvr, %5:hvxvr
+ %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr
+ %8:hvxvr = V6_vadd_sf %5:hvxvr, %7:hvxvr
+ %9:hvxvr = V6_vconv_sf_qf32 %8:hvxvr
+ V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr
+ %10:hvxvr = V6_vadd_sf %7:hvxvr, %9:hvxvr
+ %11:hvxvr = V6_vconv_sf_qf32 %10:hvxvr
+ V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr
+...
+# MUL-ENABLED-LABEL: name: qfpAdd16
+# MUL-ENABLED: V6_vconv_hf_qf16
+# MUL-ENABLED-NEXT: V6_vadd_qf16_mix
+# MUL-ENABLED-NEXT: V6_vconv_hf_qf16
+# MUL-ENABLED-NEXT: V6_vS32Ub_ai
+# MUL-ENABLED-NEXT: V6_vadd_qf16
+# DEFAULT-LABEL: name: qfpAdd16
+# DEFAULT: V6_vconv_hf_qf16
+# DEFAULT-NEXT: V6_vadd_qf16_mix
+# DEFAULT-NEXT: V6_vconv_hf_qf16
+# DEFAULT-NEXT: V6_vS32Ub_ai
+# DEFAULT-NEXT: V6_vadd_qf16
+---
+name: qfpAdd16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:intregs = COPY $r3
+ %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %6:hvxvr = V6_vadd_hf %4:hvxvr, %5:hvxvr
+ %7:hvxvr = V6_vconv_hf_qf16 %6:hvxvr
+ %8:hvxvr = V6_vadd_hf %5:hvxvr, %7:hvxvr
+ %9:hvxvr = V6_vconv_hf_qf16 %8:hvxvr
+ V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr
+ %10:hvxvr = V6_vadd_hf %7:hvxvr, %9:hvxvr
+ %11:hvxvr = V6_vconv_hf_qf16 %10:hvxvr
+ V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr
+...
+# MUL-ENABLED-LABEL: name: qfpSub32
+# MUL-ENABLED: V6_vconv_sf_qf32
+# MUL-ENABLED-NEXT: V6_vsub_qf32_mix
+# MUL-ENABLED-NEXT: V6_vconv_sf_qf32
+# MUL-ENABLED-NEXT: V6_vS32Ub_ai
+# MUL-ENABLED-NEXT: V6_vsub_qf32
+# DEFAULT-LABEL: name: qfpSub32
+# DEFAULT: V6_vconv_sf_qf32
+# DEFAULT-NEXT: V6_vsub_qf32_mix
+# DEFAULT-NEXT: V6_vconv_sf_qf32
+# DEFAULT-NEXT: V6_vS32Ub_ai
+# DEFAULT-NEXT: V6_vsub_qf32
+---
+name: qfpSub32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:intregs = COPY $r3
+ %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %6:hvxvr = V6_vsub_sf %4:hvxvr, %5:hvxvr
+ %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr
+ %8:hvxvr = V6_vsub_sf %7:hvxvr, %5:hvxvr
+ %9:hvxvr = V6_vconv_sf_qf32 %8:hvxvr
+ V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr
+ %10:hvxvr = V6_vsub_sf %7:hvxvr, %9:hvxvr
+ %11:hvxvr = V6_vconv_sf_qf32 %10:hvxvr
+ V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr
+...
+# MUL-ENABLED-LABEL: name: qfpSub16
+# MUL-ENABLED: V6_vconv_hf_qf16
+# MUL-ENABLED-NEXT: V6_vsub_qf16_mix
+# MUL-ENABLED-NEXT: V6_vconv_hf_qf16
+# MUL-ENABLED-NEXT: V6_vS32Ub_ai
+# MUL-ENABLED-NEXT: V6_vsub_qf16
+# DEFAULT-LABEL: name: qfpSub16
+# DEFAULT: V6_vconv_hf_qf16
+# DEFAULT-NEXT: V6_vsub_qf16_mix
+# DEFAULT-NEXT: V6_vconv_hf_qf16
+# DEFAULT-NEXT: V6_vS32Ub_ai
+# DEFAULT-NEXT: V6_vsub_qf16
+---
+name: qfpSub16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:intregs = COPY $r3
+ %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %6:hvxvr = V6_vsub_hf %4:hvxvr, %5:hvxvr
+ %7:hvxvr = V6_vconv_hf_qf16 %6:hvxvr
+ %8:hvxvr = V6_vsub_hf %7:hvxvr, %5:hvxvr
+ %9:hvxvr = V6_vconv_hf_qf16 %8:hvxvr
+ V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr
+ %10:hvxvr = V6_vsub_hf %7:hvxvr, %9:hvxvr
+ %11:hvxvr = V6_vconv_hf_qf16 %10:hvxvr
+ V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr
+...
+# MUL-ENABLED-LABEL: name: qfpMul32
+# MUL-ENABLED: V6_vmpy_qf32_sf
+# MUL-ENABLED-NEXT: V6_vconv_sf_qf32
+# MUL-ENABLED-NEXT: V6_vmpy_qf32_sf
+# MUL-ENABLED-NEXT: V6_vconv_sf_qf32
+# MUL-ENABLED-NEXT: V6_vmpy_qf32
+# MUL-ENABLED-NEXT: V6_vS32Ub_ai
+# DEFAULT-LABEL: name: qfpMul32
+# DEFAULT: V6_vmpy_qf32_sf
+# DEFAULT-NEXT: V6_vconv_sf_qf32
+# DEFAULT-NEXT: V6_vmpy_qf32_sf
+# DEFAULT-NEXT: V6_vconv_sf_qf32
+# DEFAULT-NEXT: V6_vmpy_qf32_sf
+# DEFAULT-NEXT: V6_vS32Ub_ai
+---
+name: qfpMul32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:intregs = COPY $r3
+ %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %6:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+ %7:hvxvr = V6_vmpy_qf32_sf %4:hvxvr, %5:hvxvr
+ %8:hvxvr = V6_vconv_sf_qf32 %7:hvxvr
+ %9:hvxvr = V6_vmpy_qf32_sf %5:hvxvr, %6:hvxvr
+ %10:hvxvr = V6_vconv_sf_qf32 %9:hvxvr
+ %11:hvxvr = V6_vmpy_qf32_sf %8:hvxvr, %10:hvxvr
+ V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr
+...
+# MUL-ENABLED-LABEL: name: qfpMul16
+# MUL-ENABLED: V6_vconv_hf_qf16
+# MUL-ENABLED-NEXT: V6_vmpy_qf16_mix_hf
+# MUL-ENABLED-NEXT: V6_vconv_hf_qf16
+# MUL-ENABLED-NEXT: V6_vS32Ub_ai
+# MUL-ENABLED-NEXT: V6_vmpy_qf16
+# DEFAULT-LABEL: name: qfpMul16
+# DEFAULT: V6_vconv_hf_qf16
+# DEFAULT-NEXT: V6_vmpy_qf16_hf
+# DEFAULT-NEXT: V6_vconv_hf_qf16
+# DEFAULT-NEXT: V6_vS32Ub_ai
+# DEFAULT-NEXT: V6_vmpy_qf16_hf
+---
+name: qfpMul16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ %0:intregs = COPY $r0
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r2
+ %3:intregs = COPY $r3
+ %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+ %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+ %6:hvxvr = V6_vmpy_qf16_hf %4:hvxvr, %5:hvxvr
+ %7:hvxvr = V6_vconv_hf_qf16 %6:hvxvr
+ %8:hvxvr = V6_vmpy_qf16_hf %5:hvxvr, %7:hvxvr
+ %9:hvxvr = V6_vconv_hf_qf16 %8:hvxvr
+ V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr
+ %10:hvxvr = V6_vmpy_qf16_hf %7:hvxvr, %9:hvxvr
+ %11:hvxvr = V6_vconv_hf_qf16 %10:hvxvr
+ V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr
diff --git a/llvm/test/CodeGen/Hexagon/vect/vect-qfp-unary.mir b/llvm/test/CodeGen/Hexagon/vect/vect-qfp-unary.mir
new file mode 100644
index 0000000..482edc8
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vect/vect-qfp-unary.mir
@@ -0,0 +1,97 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \
+# RUN: -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s
+
+
+# CHECK: name: qfp_vilog32
+# CHECK: V6_vilog2_qf32
+---
+name: qfp_vilog32
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ $v0 = V6_vL32Ub_ai $r0, 0
+ $v1 = V6_vconv_sf_qf32 $v0
+ $v2 = V6_vilog2_sf $v1
+ V6_vS32Ub_ai $r2, 0, $v2
+...
+
+# CHECK-LABEL: name: qfp_vilog16
+# CHECK: V6_vilog2_qf16
+---
+name: qfp_vilog16
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ $v0 = V6_vL32Ub_ai $r0, 0
+ $v1 = V6_vconv_hf_qf16 $v0
+ $v2 = V6_vilog2_hf $v1
+ V6_vS32Ub_ai $r2, 0, $v2
+...
+
+# CHECK: name: qfp_vneg32
+# CHECK: V6_vneg_qf32_qf32
+---
+name: qfp_vneg32
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ $v0 = V6_vL32Ub_ai $r0, 0
+ $v1 = V6_vconv_sf_qf32 $v0
+ $v2 = V6_vneg_qf32_sf $v1
+ $v3 = V6_vconv_sf_qf32 $v2
+ V6_vS32Ub_ai $r2, 0, $v3
+...
+
+# CHECK-LABEL: name: qfp_vneg16
+# CHECK: V6_vneg_qf16_qf16
+---
+name: qfp_vneg16
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ $v0 = V6_vL32Ub_ai $r0, 0
+ $v1 = V6_vconv_hf_qf16 $v0
+ $v2 = V6_vneg_qf16_hf $v1
+ $v3 = V6_vconv_hf_qf16 $v2
+ V6_vS32Ub_ai $r2, 0, $v3
+...
+
+# CHECK: name: qfp_vabs32
+# CHECK: V6_vabs_qf32_qf32
+---
+name: qfp_vabs32
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ $v0 = V6_vL32Ub_ai $r0, 0
+ $v1 = V6_vconv_sf_qf32 $v0
+ $v2 = V6_vabs_qf32_sf $v1
+ $v3 = V6_vconv_sf_qf32 $v2
+ V6_vS32Ub_ai $r2, 0, $v3
+...
+
+# CHECK-LABEL: name: qfp_vabs16
+# CHECK: V6_vabs_qf16_qf16
+---
+name: qfp_vabs16
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $r0, $r1, $r2, $r3
+ $v0 = V6_vL32Ub_ai $r0, 0
+ $v1 = V6_vconv_hf_qf16 $v0
+ $v2 = V6_vabs_qf16_hf $v1
+ $v3 = V6_vconv_hf_qf16 $v2
+ V6_vS32Ub_ai $r2, 0, $v3
+...
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
index 000c67ef..8af4277 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
@@ -1531,6 +1531,7 @@ Key: RDSSPQ: [ 0.00 0.00 ]
Key: RDTSC: [ 0.00 0.00 ]
Key: RDTSCP: [ 0.00 0.00 ]
Key: REG_SEQUENCE: [ 0.00 0.00 ]
+Key: RELOC_NONE: [ 0.00 0.00 ]
Key: REPNE_PREFIX: [ 0.00 0.00 ]
Key: REP_MOVSB: [ 0.00 0.00 ]
Key: REP_MOVSD: [ 0.00 0.00 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
index bb72886..e133426 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
@@ -1531,6 +1531,7 @@ Key: RDSSPQ: [ 0.00 0.00 ]
Key: RDTSC: [ 0.00 0.00 ]
Key: RDTSCP: [ 0.00 0.00 ]
Key: REG_SEQUENCE: [ 0.00 0.00 ]
+Key: RELOC_NONE: [ 0.00 0.00 ]
Key: REPNE_PREFIX: [ 0.00 0.00 ]
Key: REP_MOVSB: [ 0.00 0.00 ]
Key: REP_MOVSD: [ 0.00 0.00 ]
diff --git a/llvm/test/CodeGen/PowerPC/annotate-metadata.ll b/llvm/test/CodeGen/PowerPC/annotate-metadata.ll
new file mode 100644
index 0000000..4149b56
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/annotate-metadata.ll
@@ -0,0 +1,15 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc-ibm-aix-xcoff < \
+; RUN: %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64le-unknown-linux < \
+; RUN: %s | FileCheck %s
+
+@.str = private unnamed_addr constant [12 x i8] c"MY_METADATA\00", section "llvm.metadata"
+@.str.1 = private unnamed_addr constant [10 x i8] c"my_file.c\00", section "llvm.metadata"
+@global.annotations = appending global [3 x { ptr, ptr, ptr, i32, ptr }] [{ ptr, ptr, ptr, i32, ptr } { ptr @a, ptr @.str, ptr @.str.1, i32 100, ptr null }, { ptr, ptr, ptr, i32, ptr } { ptr @b, ptr @.str, ptr @.str.1, i32 200, ptr null }, { ptr, ptr, ptr, i32, ptr } { ptr @c, ptr @.str, ptr @.str.1, i32 300, ptr null }], section "llvm.metadata"
+
+@a = global i32 1
+@b = global i32 2
+@c = global i32 3
+
+; CHECK-NOT: metadata
+; CHECK-NOT: annotations
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
index d666832..c79fb0f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
@@ -460,9 +460,9 @@ define void @reserved_call_frame(i64 %n) #0 {
; RV64I-NEXT: lui a0, 1
; RV64I-NEXT: sub sp, sp, a0
; RV64I-NEXT: sd zero, 0(sp)
-; RV64I-NEXT: .cfi_def_cfa_offset 4096
+; RV64I-NEXT: .cfi_def_cfa_offset 6128
; RV64I-NEXT: addi sp, sp, -48
-; RV64I-NEXT: .cfi_def_cfa_offset 4144
+; RV64I-NEXT: .cfi_def_cfa_offset 6176
; RV64I-NEXT: lui a0, 1
; RV64I-NEXT: add a0, sp, a0
; RV64I-NEXT: call callee_stack_args
@@ -485,9 +485,9 @@ define void @reserved_call_frame(i64 %n) #0 {
; RV32I-NEXT: lui a0, 1
; RV32I-NEXT: sub sp, sp, a0
; RV32I-NEXT: sw zero, 0(sp)
-; RV32I-NEXT: .cfi_def_cfa_offset 4096
+; RV32I-NEXT: .cfi_def_cfa_offset 6128
; RV32I-NEXT: addi sp, sp, -80
-; RV32I-NEXT: .cfi_def_cfa_offset 4176
+; RV32I-NEXT: .cfi_def_cfa_offset 6208
; RV32I-NEXT: lui a0, 1
; RV32I-NEXT: addi a0, a0, 36
; RV32I-NEXT: add a0, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
index ad2ed47..0341862 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
@@ -570,7 +570,82 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i8i32_multiple_users(ptr %x, ptr %y,
ret <vscale x 2 x i32> %i
}
+define <vscale x 4 x i32> @mismatched_extend_sub_add(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
+; FOLDING-LABEL: mismatched_extend_sub_add:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vzext.vf2 v10, v8
+; FOLDING-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; FOLDING-NEXT: vwsub.wv v12, v10, v9
+; FOLDING-NEXT: vwadd.wv v10, v10, v9
+; FOLDING-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vmul.vv v8, v12, v10
+; FOLDING-NEXT: ret
+ %a = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %b = sext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %c = sub <vscale x 4 x i32> %a, %b
+ %d = add <vscale x 4 x i32> %a, %b
+ %e = mul <vscale x 4 x i32> %c, %d
+ ret <vscale x 4 x i32> %e
+}
+
+; FIXME: this should remove the vsext
+define <vscale x 4 x i32> @mismatched_extend_sub_add_commuted(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
+; FOLDING-LABEL: mismatched_extend_sub_add_commuted:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vzext.vf2 v10, v8
+; FOLDING-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; FOLDING-NEXT: vwsub.wv v12, v10, v9
+; FOLDING-NEXT: vwadd.wv v10, v10, v9
+; FOLDING-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vmul.vv v8, v12, v10
+; FOLDING-NEXT: ret
+ %a = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %b = sext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %c = sub <vscale x 4 x i32> %a, %b
+ %d = add <vscale x 4 x i32> %b, %a
+ %e = mul <vscale x 4 x i32> %c, %d
+ ret <vscale x 4 x i32> %e
+}
+define <vscale x 4 x i32> @mismatched_extend_add_sub(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
+; FOLDING-LABEL: mismatched_extend_add_sub:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vzext.vf2 v10, v8
+; FOLDING-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; FOLDING-NEXT: vwadd.wv v12, v10, v9
+; FOLDING-NEXT: vwsub.wv v10, v10, v9
+; FOLDING-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vmul.vv v8, v12, v10
+; FOLDING-NEXT: ret
+ %a = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %b = sext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %c = add <vscale x 4 x i32> %a, %b
+ %d = sub <vscale x 4 x i32> %a, %b
+ %e = mul <vscale x 4 x i32> %c, %d
+ ret <vscale x 4 x i32> %e
+}
+
+define <vscale x 4 x i32> @mismatched_extend_add_sub_commuted(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
+; FOLDING-LABEL: mismatched_extend_add_sub_commuted:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vzext.vf2 v10, v8
+; FOLDING-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; FOLDING-NEXT: vwadd.wv v12, v10, v9
+; FOLDING-NEXT: vwsub.wv v10, v10, v9
+; FOLDING-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vmul.vv v8, v12, v10
+; FOLDING-NEXT: ret
+ %a = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %b = sext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %c = add <vscale x 4 x i32> %a, %b
+ %d = sub <vscale x 4 x i32> %a, %b
+ %e = mul <vscale x 4 x i32> %c, %d
+ ret <vscale x 4 x i32> %e
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32: {{.*}}
diff --git a/llvm/test/CodeGen/SystemZ/vec-load-element.ll b/llvm/test/CodeGen/SystemZ/vec-load-element.ll
index 2baaed1..9bef279 100644
--- a/llvm/test/CodeGen/SystemZ/vec-load-element.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-load-element.ll
@@ -5,8 +5,8 @@
; CHECK-LABEL: .LBB0_1:
; CHECK-NOT: l %r
; CHECK-NOT: vlvgf
-; CHECK: pfd
-; CHECK: vlef
+; CHECK-DAG: pfd
+; CHECK-DAG: vlef
%type0 = type { i32, [400 x i8], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
@Mem = external global [150 x %type0], align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
index ee2e58f..a1771f9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
@@ -98,28 +98,29 @@ body: |
; CHECK-LABEL: name: foo
; CHECK: liveins: $q0, $r0, $r1, $r2, $lr
- ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr
- ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
- ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
- ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
- ; CHECK: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg
- ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7
- ; CHECK: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2)
- ; CHECK: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1)
- ; CHECK: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0)
- ; CHECK: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr {
- ; CHECK: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr
- ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4)
- ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4)
- ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4)
- ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4)
- ; CHECK: }
- ; CHECK: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 {
- ; CHECK: MVE_VPST 4, implicit $vpr
- ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4)
- ; CHECK: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4)
- ; CHECK: }
- ; CHECK: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -4
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $r7, -8
+ ; CHECK-NEXT: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg
+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $r7
+ ; CHECK-NEXT: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2)
+ ; CHECK-NEXT: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1)
+ ; CHECK-NEXT: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0)
+ ; CHECK-NEXT: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr :: (load (s128) from %ir.src, align 4), (store (s128) into %ir.dest, align 4), (load (s128) from %ir.src2, align 4), (store (s128) into %ir.dest2, align 4) {
+ ; CHECK-NEXT: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr
+ ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4)
+ ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4)
+ ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4)
+ ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 :: (load (s128) from %ir.src3, align 4), (store (s128) into %ir.dest3, align 4) {
+ ; CHECK-NEXT: MVE_VPST 4, implicit $vpr
+ ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4)
+ ; CHECK-NEXT: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0
$sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr
frame-setup CFI_INSTRUCTION def_cfa_offset 8
frame-setup CFI_INSTRUCTION offset $lr, -4
diff --git a/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll
new file mode 100644
index 0000000..841c9a6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=CHECK
+
+define void @test_reloc_none() {
+; CHECK-LABEL: test_reloc_none:
+; CHECK: # %bb.0:
+; CHECK-NEXT: .Lreloc_none0:
+; CHECK-NEXT: .reloc .Lreloc_none0, BFD_RELOC_NONE, foo
+; CHECK-NEXT: retq
+ call void @llvm.reloc.none(metadata !"foo")
+ ret void
+}
+
+declare void @llvm.reloc.none(metadata)
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 32d2252..9d31c29 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1056,26 +1056,45 @@ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: chain_reset_i256:
-; X64: # %bb.0:
-; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT: movl $-2, %eax
-; X64-NEXT: roll %cl, %eax
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: andl $28, %ecx
-; X64-NEXT: andl %eax, (%rdi,%rcx)
-; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq 8(%rdi), %r8
-; X64-NEXT: orq 24(%rdi), %r8
-; X64-NEXT: movq 16(%rdi), %rdi
-; X64-NEXT: orq %rcx, %rdi
-; X64-NEXT: movl (%rsi), %eax
-; X64-NEXT: movl %ecx, (%rsi)
-; X64-NEXT: movl (%rdx), %ecx
-; X64-NEXT: addl %ecx, %eax
-; X64-NEXT: orq %r8, %rdi
-; X64-NEXT: cmovnel %ecx, %eax
-; X64-NEXT: retq
+; SSE-LABEL: chain_reset_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: # kill: def $ecx killed $ecx def $rcx
+; SSE-NEXT: movl $-2, %eax
+; SSE-NEXT: roll %cl, %eax
+; SSE-NEXT: shrl $3, %ecx
+; SSE-NEXT: andl $28, %ecx
+; SSE-NEXT: andl %eax, (%rdi,%rcx)
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %r8
+; SSE-NEXT: orq 24(%rdi), %r8
+; SSE-NEXT: movq 16(%rdi), %rdi
+; SSE-NEXT: orq %rcx, %rdi
+; SSE-NEXT: movl (%rsi), %eax
+; SSE-NEXT: movl %ecx, (%rsi)
+; SSE-NEXT: movl (%rdx), %ecx
+; SSE-NEXT: addl %ecx, %eax
+; SSE-NEXT: orq %r8, %rdi
+; SSE-NEXT: cmovnel %ecx, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: chain_reset_i256:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX-NEXT: movl $-2, %eax
+; AVX-NEXT: roll %cl, %eax
+; AVX-NEXT: shrl $3, %ecx
+; AVX-NEXT: andl $28, %ecx
+; AVX-NEXT: andl %eax, (%rdi,%rcx)
+; AVX-NEXT: vmovdqu (%rdi), %ymm0
+; AVX-NEXT: movl (%rdi), %ecx
+; AVX-NEXT: movl (%rsi), %eax
+; AVX-NEXT: movl %ecx, (%rsi)
+; AVX-NEXT: movl (%rdx), %ecx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vptest %ymm0, %ymm0
+; AVX-NEXT: cmovnel %ecx, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%rem = and i32 %position, 255
%ofs = zext nneg i32 %rem to i256
%bit = shl nuw i256 1, %ofs
diff --git a/llvm/test/DebugInfo/extradata-node-reference.ll b/llvm/test/DebugInfo/extradata-node-reference.ll
new file mode 100644
index 0000000..0ec9312
--- /dev/null
+++ b/llvm/test/DebugInfo/extradata-node-reference.ll
@@ -0,0 +1,101 @@
+;; Test verifies that node references in the extraData field are handled correctly
+;; when used with tags like DW_TAG_member, DW_TAG_inheritance, etc.
+
+; REQUIRES: object-emission
+; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s -check-prefix=CHECK-IR
+; RUN: verify-uselistorder %s
+
+; Example 1: BitField with storage offset (extraData: i64 0)
+%struct.BitField = type { i8 }
+@bf = global %struct.BitField zeroinitializer, !dbg !9
+
+; Example 2: Static member with constant value (extraData: i32 42)
+%struct.Static = type { i32 }
+@st = global %struct.Static zeroinitializer, !dbg !16
+
+; Example 3: Discriminant value for variant (extraData: i32 100)
+%union.Variant = type { [8 x i8] }
+@var = global %union.Variant zeroinitializer, !dbg !24
+
+; Example 4: Inheritance VBPtr offset (extraData: i32 0)
+%class.Derived = type { i32 }
+@der = global %class.Derived zeroinitializer, !dbg !35
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 11.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !8)
+!1 = !DIFile(filename: "test.cpp", directory: ".")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 1, !"wchar_size", i32 4}
+!4 = !{i32 2, !"Dwarf Version", i32 5}
+!8 = !{!9, !16, !24, !35}
+
+; extraData node definitions
+!15 = !{i64 0} ; BitField storage offset
+!22 = !{i32 42} ; Static member constant value
+!33 = !{i32 100} ; Discriminant value
+!41 = !{i32 0} ; VBPtr offset
+
+; CHECK-IR: !9 = !DIDerivedType(tag: DW_TAG_member, name: "const_val", scope: !7, file: !3, line: 11, baseType: !10, flags: DIFlagStaticMember, extraData: !12)
+; CHECK-IR: !12 = !{i32 42}
+; CHECK-IR: !20 = !DIDerivedType(tag: DW_TAG_member, name: "variant_some", scope: !17, file: !3, baseType: !11, size: 32, extraData: !21)
+; CHECK-IR: !21 = !{i32 100}
+; CHECK-IR: !27 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !25, baseType: !28, extraData: !29)
+; CHECK-IR: !29 = !{i32 0}
+; CHECK-IR: !32 = !DIDerivedType(tag: DW_TAG_member, name: "field", scope: !30, file: !3, line: 6, baseType: !11, size: 3, flags: DIFlagBitField, extraData: !33)
+; CHECK-IR: !33 = !{i64 0}
+
+; CHECK: {{.*}} DW_TAG_variable
+; CHECK: {{.*}} DW_AT_name ("bf")
+; CHECK: {{.*}} DW_TAG_member
+; CHECK: {{.*}} DW_AT_name ("field")
+; === BitField: extraData holds storage offset ===
+!9 = !DIGlobalVariableExpression(var: !10, expr: !DIExpression())
+!10 = distinct !DIGlobalVariable(name: "bf", scope: !0, file: !1, line: 5, type: !11, isLocal: false, isDefinition: true)
+!11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "BitField", file: !1, line: 5, size: 8, elements: !12)
+!12 = !{!13}
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "field", scope: !11, file: !1, line: 6, baseType: !14, size: 3, flags: DIFlagBitField, extraData: !15)
+!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+
+; CHECK: {{.*}} DW_TAG_variable
+; CHECK: {{.*}} DW_AT_name ("st")
+; CHECK: {{.*}} DW_TAG_member
+; CHECK: {{.*}} DW_AT_name ("const_val")
+; CHECK: {{.*}} DW_AT_const_value (42)
+; === Static Member: extraData holds constant value ===
+!16 = !DIGlobalVariableExpression(var: !17, expr: !DIExpression())
+!17 = distinct !DIGlobalVariable(name: "st", scope: !0, file: !1, line: 10, type: !18, isLocal: false, isDefinition: true)
+!18 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Static", file: !1, line: 10, size: 32, elements: !19)
+!19 = !{!20}
+!20 = !DIDerivedType(tag: DW_TAG_member, name: "const_val", scope: !18, file: !1, line: 11, baseType: !21, flags: DIFlagStaticMember, extraData: !22)
+!21 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !14)
+
+; CHECK: {{.*}} DW_TAG_variable
+; CHECK: {{.*}} DW_AT_name ("var")
+; CHECK: {{.*}} DW_TAG_member
+; CHECK: {{.*}} DW_AT_name ("variant_none")
+; CHECK: {{.*}} DW_AT_discr_value (0x64)
+; === Discriminant: extraData holds discriminant value ===
+!24 = !DIGlobalVariableExpression(var: !25, expr: !DIExpression())
+!25 = distinct !DIGlobalVariable(name: "var", scope: !0, file: !1, line: 15, type: !26, isLocal: false, isDefinition: true)
+!26 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Variant", file: !1, line: 15, size: 128, elements: !27)
+!27 = !{!28}
+!28 = !DICompositeType(tag: DW_TAG_variant_part, scope: !26, file: !1, size: 128, elements: !29, discriminator: !30)
+!29 = !{!31, !32}
+!30 = !DIDerivedType(tag: DW_TAG_member, scope: !28, file: !1, baseType: !14, size: 32, align: 32, flags: DIFlagArtificial)
+!31 = !DIDerivedType(tag: DW_TAG_member, name: "variant_none", scope: !28, file: !1, baseType: !14, size: 32)
+!32 = !DIDerivedType(tag: DW_TAG_member, name: "variant_some", scope: !28, file: !1, baseType: !14, size: 32, extraData: !33)
+
+; CHECK: {{.*}} DW_TAG_variable
+; CHECK: {{.*}} DW_AT_name ("der")
+; CHECK: {{.*}} DW_TAG_inheritance
+; CHECK: {{.*}} DW_AT_type ({{.*}} "Base")
+; === Inheritance: extraData holds VBPtr offset ===
+!35 = !DIGlobalVariableExpression(var: !36, expr: !DIExpression())
+!36 = distinct !DIGlobalVariable(name: "der", scope: !0, file: !1, line: 20, type: !37, isLocal: false, isDefinition: true)
+!37 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Derived", file: !1, line: 20, size: 32, elements: !38)
+!38 = !{!39}
+!39 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !37, baseType: !40, extraData: !41)
+!40 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Base", file: !1, line: 19, size: 32)
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
index 18960b4..3170f2c 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
@@ -96,7 +96,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
// CHECK: const uint8_t *GenMyCombiner::getMatchTable() const {
// CHECK-NEXT: constexpr static uint8_t MatchTable0[] = {
-// CHECK-NEXT: /* 0 */ GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(99), GIMT_Encode2(211), /*)*//*default:*//*Label 5*/ GIMT_Encode4(524),
+// CHECK-NEXT: /* 0 */ GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(100), GIMT_Encode2(212), /*)*//*default:*//*Label 5*/ GIMT_Encode4(524),
// CHECK-NEXT: /* 10 */ /*TargetOpcode::G_STORE*//*Label 0*/ GIMT_Encode4(458), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
// CHECK-NEXT: /* 182 */ /*TargetOpcode::G_SEXT*//*Label 1*/ GIMT_Encode4(476), GIMT_Encode4(0),
// CHECK-NEXT: /* 190 */ /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(488), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
diff --git a/llvm/test/TableGen/RegClassByHwMode.td b/llvm/test/TableGen/RegClassByHwMode.td
index ca72cfb..a21a396 100644
--- a/llvm/test/TableGen/RegClassByHwMode.td
+++ b/llvm/test/TableGen/RegClassByHwMode.td
@@ -6,18 +6,21 @@
include "llvm/Target/Target.td"
-// INSTRINFO: #ifdef GET_INSTRINFO_ENUM
+// INSTRINFO: #ifdef GET_INSTRINFO_ENUM
// INSTRINFO-NEXT: #undef GET_INSTRINFO_ENUM
+// INSTRINFO-EMPTY:
// INSTRINFO-NEXT: namespace llvm::MyTarget {
+// INSTRINFO-EMPTY:
// INSTRINFO-NEXT: enum {
-// INSTRINFO-NEXT: PHI
-// INSTRINFO: };
-// INSTRINFO: enum RegClassByHwModeUses : uint16_t {
+// INSTRINFO-NEXT: PHI
+// INSTRINFO: };
+// INSTRINFO: enum RegClassByHwModeUses : uint16_t {
// INSTRINFO-NEXT: MyPtrRC,
// INSTRINFO-NEXT: XRegs_EvenIfRequired,
// INSTRINFO-NEXT: YRegs_EvenIfRequired,
// INSTRINFO-NEXT: };
-// INSTRINFO-NEXT: }
+// INSTRINFO-EMPTY:
+// INSTRINFO-NEXT: } // namespace llvm::MyTarget
// INSTRINFO: { MyTarget::XRegsRegClassID, 0, MCOI::OPERAND_REGISTER, 0 },
// INSTRINFO: { MyTarget::XRegs_EvenRegClassID, 0, MCOI::OPERAND_REGISTER, 0 },
diff --git a/llvm/test/TableGen/get-named-operand-idx.td b/llvm/test/TableGen/get-named-operand-idx.td
index e6f6331..8bb4f2f 100644
--- a/llvm/test/TableGen/get-named-operand-idx.td
+++ b/llvm/test/TableGen/get-named-operand-idx.td
@@ -50,7 +50,9 @@ def InstD : InstBase {
// CHECK-LABEL: #ifdef GET_INSTRINFO_OPERAND_ENUM
// CHECK-NEXT: #undef GET_INSTRINFO_OPERAND_ENUM
+// CHECK-EMPTY:
// CHECK-NEXT: namespace llvm::MyNamespace {
+// CHECK-EMPTY:
// CHECK-NEXT: enum class OpName : uint8_t {
// CHECK-NEXT: a = 0,
// CHECK-NEXT: b = 1,
@@ -62,12 +64,16 @@ def InstD : InstBase {
// CHECK-EMPTY:
// CHECK-NEXT: LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, OpName Name);
// CHECK-NEXT: LLVM_READONLY OpName getOperandIdxName(uint16_t Opcode, int16_t Idx);
-// CHECK-NEXT: } // end namespace llvm::MyNamespace
-// CHECK-NEXT: #endif //GET_INSTRINFO_OPERAND_ENUM
+// CHECK-EMPTY:
+// CHECK-NEXT: } // namespace llvm::MyNamespace
+// CHECK-EMPTY:
+// CHECK-NEXT: #endif // GET_INSTRINFO_OPERAND_ENUM
// CHECK-LABEL: #ifdef GET_INSTRINFO_NAMED_OPS
// CHECK-NEXT: #undef GET_INSTRINFO_NAMED_OPS
+// CHECK-EMPTY:
// CHECK-NEXT: namespace llvm::MyNamespace {
+// CHECK-EMPTY:
// CHECK-NEXT: LLVM_READONLY static uint8_t getInstructionIndexForOpLookup(uint16_t Opcode) {
// CHECK-NEXT: static constexpr uint8_t InstructionIndex[] = {
// CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -89,7 +95,8 @@ def InstD : InstBase {
// CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-// CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0,
+// CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2,
+// CHECK-NEXT: 0,
// CHECK-NEXT: };
// CHECK-NEXT: return InstructionIndex[Opcode];
// CHECK-NEXT: }
@@ -113,5 +120,7 @@ def InstD : InstBase {
// CHECK-NEXT: unsigned InstrIdx = getInstructionIndexForOpLookup(Opcode);
// CHECK-NEXT: return OperandMap[InstrIdx][(unsigned)Idx];
// CHECK-NEXT: }
-// CHECK-NEXT: } // end namespace llvm::MyNamespace
-// CHECK-NEXT: #endif //GET_INSTRINFO_NAMED_OPS
+// CHECK-EMPTY:
+// CHECK-NEXT: } // namespace llvm::MyNamespace
+// CHECK-EMPTY:
+// CHECK-NEXT: #endif // GET_INSTRINFO_NAMED_OPS
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
index d80178fd..866487d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
@@ -70,3 +70,28 @@ loop:
exit:
ret i32 %red.next
}
+
+define i16 @test_incomplete_chain_without_mul(ptr noalias %dst, ptr %A, ptr %B) #0 {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %red = phi i16 [ 0, %entry ], [ %red.next, %loop ]
+ %l.a = load i8, ptr %A, align 1
+ %a.ext = zext i8 %l.a to i16
+ store i16 %a.ext, ptr %dst, align 2
+ %l.b = load i8, ptr %B, align 1
+ %b.ext = zext i8 %l.b to i16
+ %add = add i16 %red, %b.ext
+ %add.1 = add i16 %add, %a.ext
+ %red.next = add i16 %add.1, %b.ext
+ %iv.next = add i64 %iv, 1
+ %ec = icmp ult i64 %iv, 1024
+ br i1 %ec, label %loop, label %exit
+
+exit:
+ ret i16 %red.next
+}
+
+attributes #0 = { "target-cpu"="grace" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll b/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll
new file mode 100644
index 0000000..6597519
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define <4 x i32> @test() {
+; CHECK-LABEL: define <4 x i32> @test() {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 0 to i32
+; CHECK-NEXT: br label %[[BB1:.*]]
+; CHECK: [[BB1]]:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[TRUNC]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[TRUNC]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 0, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0>
+; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[OR]] to i64
+; CHECK-NEXT: br label %[[BB3:.*]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: ret <4 x i32> [[TMP3]]
+;
+bb:
+ %trunc = trunc i64 0 to i32
+ br label %bb1
+
+bb1:
+ %or = or i32 %trunc, 0
+ %zext = zext i32 %or to i64
+ %and = and i32 0, 0
+ %or2 = or i32 %trunc, 0
+ br label %bb3
+
+bb3:
+ %0 = insertelement <4 x i32> zeroinitializer, i32 %trunc, i32 0
+ %1 = insertelement <4 x i32> %0, i32 %and, i32 1
+ %2 = insertelement <4 x i32> %1, i32 %or2, i32 2
+ %3 = insertelement <4 x i32> %2, i32 %or, i32 3
+ ret <4 x i32> %3
+}
diff --git a/llvm/test/Verifier/reloc-none.ll b/llvm/test/Verifier/reloc-none.ll
new file mode 100644
index 0000000..9c96799
--- /dev/null
+++ b/llvm/test/Verifier/reloc-none.ll
@@ -0,0 +1,13 @@
+; RUN: not llvm-as -disable-output 2>&1 %s | FileCheck %s
+
+; CHECK: llvm.reloc.none argument must be a metadata string
+; CHECK-NEXT: call void @llvm.reloc.none(metadata !0)
+
+define void @test_reloc_none_bad_arg() {
+ call void @llvm.reloc.none(metadata !0)
+ ret void
+}
+
+declare void @llvm.reloc.none(metadata)
+
+!0 = !{}
diff --git a/llvm/test/tools/dxil-dis/llvm_assume.ll b/llvm/test/tools/dxil-dis/llvm_assume.ll
deleted file mode 100644
index f5be66c..0000000
--- a/llvm/test/tools/dxil-dis/llvm_assume.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc --filetype=obj %s -o - | dxil-dis -o - | FileCheck %s
-
-target triple = "dxil-pc-shadermodel6.7-library"
-
-define void @test_llvm_assume(i1 %0) {
-; CHECK-LABEL: test_llvm_assume
-; CHECK-NEXT: tail call void @llvm.assume(i1 %0)
-tail call void @llvm.assume(i1 %0)
-ret void
-}
-
diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt
index 141a56a..ec061ff 100644
--- a/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt
+++ b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt
@@ -1,33 +1,33 @@
MAX_RELATION=4
-187 7051 1
-187 6948 2
-187 187 0
-187 7051 1
+187 7052 1
187 6949 2
+187 187 0
+187 7052 1
+187 6950 2
187 10 0
-10 7051 1
-10 7051 2
-10 7051 3
-10 6941 4
+10 7052 1
+10 7052 2
+10 7052 3
+10 6942 4
10 187 0
-187 6932 1
-187 7051 2
-187 1543 0
-1543 6862 1
-1543 6932 2
-187 7051 1
-187 6948 2
-187 187 0
-187 7051 1
+187 6933 1
+187 7052 2
+187 1544 0
+1544 6863 1
+1544 6933 2
+187 7052 1
187 6949 2
+187 187 0
+187 7052 1
+187 6950 2
187 601 0
-601 7051 1
-601 7051 2
-601 7051 3
-601 6941 4
+601 7052 1
+601 7052 2
+601 7052 3
+601 6942 4
601 187 0
-187 6932 1
-187 7051 2
-187 1543 0
-1543 6862 1
-1543 6932 2
+187 6933 1
+187 7052 2
+187 1544 0
+1544 6863 1
+1544 6933 2
diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt
index dbbbbc7..1b90a8a 100644
--- a/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt
+++ b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt
@@ -1,4 +1,4 @@
-7151
+7152
AAA 0
AAD 1
AADD 2
@@ -1532,5621 +1532,5622 @@ RDSSPQ 1529
RDTSC 1530
RDTSCP 1531
REG_SEQUENCE 1532
-REPNE_PREFIX 1533
-REP_MOVSB 1534
-REP_MOVSD 1535
-REP_MOVSQ 1536
-REP_MOVSW 1537
-REP_PREFIX 1538
-REP_STOSB 1539
-REP_STOSD 1540
-REP_STOSQ 1541
-REP_STOSW 1542
-RET 1543
-RETI 1544
-REX 1545
-RMPADJUST 1546
-RMPQUERY 1547
-RMPUPDATE 1548
-ROL 1549
-ROR 1550
-RORX 1551
-ROUNDPDmi 1552
-ROUNDPDri 1553
-ROUNDPSmi 1554
-ROUNDPSri 1555
-ROUNDSDmi 1556
-ROUNDSDmi_Int 1557
-ROUNDSDri 1558
-ROUNDSDri_Int 1559
-ROUNDSSmi 1560
-ROUNDSSmi_Int 1561
-ROUNDSSri 1562
-ROUNDSSri_Int 1563
-RSM 1564
-RSQRTPSm 1565
-RSQRTPSr 1566
-RSQRTSSm 1567
-RSQRTSSm_Int 1568
-RSQRTSSr 1569
-RSQRTSSr_Int 1570
-RSTORSSP 1571
-SAHF 1572
-SALC 1573
-SAR 1574
-SARX 1575
-SAVEPREVSSP 1576
-SBB 1577
-SCASB 1578
-SCASL 1579
-SCASQ 1580
-SCASW 1581
-SEAMCALL 1582
-SEAMOPS 1583
-SEAMRET 1584
-SEG_ALLOCA 1585
-SEH_BeginEpilogue 1586
-SEH_EndEpilogue 1587
-SEH_EndPrologue 1588
-SEH_PushFrame 1589
-SEH_PushReg 1590
-SEH_SaveReg 1591
-SEH_SaveXMM 1592
-SEH_SetFrame 1593
-SEH_StackAlign 1594
-SEH_StackAlloc 1595
-SEH_UnwindV 1596
-SEH_UnwindVersion 1597
-SENDUIPI 1598
-SERIALIZE 1599
-SETB_C 1600
-SETCCm 1601
-SETCCm_EVEX 1602
-SETCCr 1603
-SETCCr_EVEX 1604
-SETSSBSY 1605
-SETZUCCm 1606
-SETZUCCr 1607
-SFENCE 1608
-SGDT 1609
-SHA 1610
-SHL 1611
-SHLD 1612
-SHLDROT 1613
-SHLX 1614
-SHR 1615
-SHRD 1616
-SHRDROT 1617
-SHRX 1618
-SHUFPDrmi 1619
-SHUFPDrri 1620
-SHUFPSrmi 1621
-SHUFPSrri 1622
-SIDT 1623
-SKINIT 1624
-SLDT 1625
-SLWPCB 1626
-SMSW 1627
-SQRTPDm 1628
-SQRTPDr 1629
-SQRTPSm 1630
-SQRTPSr 1631
-SQRTSDm 1632
-SQRTSDm_Int 1633
-SQRTSDr 1634
-SQRTSDr_Int 1635
-SQRTSSm 1636
-SQRTSSm_Int 1637
-SQRTSSr 1638
-SQRTSSr_Int 1639
-SQRT_F 1640
-SQRT_Fp 1641
-SS_PREFIX 1642
-STAC 1643
-STACKALLOC_W_PROBING 1644
-STACKMAP 1645
-STATEPOINT 1646
-STC 1647
-STD 1648
-STGI 1649
-STI 1650
-STMXCSR 1651
-STOSB 1652
-STOSL 1653
-STOSQ 1654
-STOSW 1655
-STR 1656
-STRm 1657
-STTILECFG 1658
-STTILECFG_EVEX 1659
-STUI 1660
-ST_F 1661
-ST_FP 1662
-ST_FPrr 1663
-ST_Fp 1664
-ST_FpP 1665
-ST_Frr 1666
-SUB 1667
-SUBPDrm 1668
-SUBPDrr 1669
-SUBPSrm 1670
-SUBPSrr 1671
-SUBREG_TO_REG 1672
-SUBR_F 1673
-SUBR_FI 1674
-SUBR_FPrST 1675
-SUBR_FST 1676
-SUBR_Fp 1677
-SUBR_FpI 1678
-SUBR_FrST 1679
-SUBSDrm 1680
-SUBSDrm_Int 1681
-SUBSDrr 1682
-SUBSDrr_Int 1683
-SUBSSrm 1684
-SUBSSrm_Int 1685
-SUBSSrr 1686
-SUBSSrr_Int 1687
-SUB_F 1688
-SUB_FI 1689
-SUB_FPrST 1690
-SUB_FST 1691
-SUB_Fp 1692
-SUB_FpI 1693
-SUB_FrST 1694
-SWAPGS 1695
-SYSCALL 1696
-SYSENTER 1697
-SYSEXIT 1698
-SYSRET 1699
-T 1700
-TAILJMPd 1701
-TAILJMPd_CC 1702
-TAILJMPm 1703
-TAILJMPr 1704
-TCMMIMFP 1705
-TCMMRLFP 1706
-TCRETURN_HIPE 1707
-TCRETURN_WIN 1708
-TCRETURN_WINmi 1709
-TCRETURNdi 1710
-TCRETURNdicc 1711
-TCRETURNmi 1712
-TCRETURNri 1713
-TCVTROWD 1714
-TCVTROWPS 1715
-TDCALL 1716
-TDPBF 1717
-TDPBHF 1718
-TDPBSSD 1719
-TDPBSUD 1720
-TDPBUSD 1721
-TDPBUUD 1722
-TDPFP 1723
-TDPHBF 1724
-TDPHF 1725
-TEST 1726
-TESTUI 1727
-TILELOADD 1728
-TILELOADDRS 1729
-TILELOADDRST 1730
-TILELOADDRS_EVEX 1731
-TILELOADDT 1732
-TILELOADD_EVEX 1733
-TILEMOVROWrre 1734
-TILEMOVROWrri 1735
-TILERELEASE 1736
-TILESTORED 1737
-TILESTORED_EVEX 1738
-TILEZERO 1739
-TLBSYNC 1740
-TLSCall 1741
-TLS_addr 1742
-TLS_addrX 1743
-TLS_base_addr 1744
-TLS_base_addrX 1745
-TLS_desc 1746
-TMMULTF 1747
-TPAUSE 1748
-TRAP 1749
-TST_F 1750
-TST_Fp 1751
-TZCNT 1752
-TZMSK 1753
-UBSAN_UD 1754
-UCOMISDrm 1755
-UCOMISDrm_Int 1756
-UCOMISDrr 1757
-UCOMISDrr_Int 1758
-UCOMISSrm 1759
-UCOMISSrm_Int 1760
-UCOMISSrr 1761
-UCOMISSrr_Int 1762
-UCOM_FIPr 1763
-UCOM_FIr 1764
-UCOM_FPPr 1765
-UCOM_FPr 1766
-UCOM_FpIr 1767
-UCOM_Fpr 1768
-UCOM_Fr 1769
-UD 1770
-UIRET 1771
-UMONITOR 1772
-UMWAIT 1773
-UNPCKHPDrm 1774
-UNPCKHPDrr 1775
-UNPCKHPSrm 1776
-UNPCKHPSrr 1777
-UNPCKLPDrm 1778
-UNPCKLPDrr 1779
-UNPCKLPSrm 1780
-UNPCKLPSrr 1781
-URDMSRri 1782
-URDMSRri_EVEX 1783
-URDMSRrr 1784
-URDMSRrr_EVEX 1785
-UWRMSRir 1786
-UWRMSRir_EVEX 1787
-UWRMSRrr 1788
-UWRMSRrr_EVEX 1789
-V 1790
-VAARG 1791
-VAARG_X 1792
-VADDBF 1793
-VADDPDYrm 1794
-VADDPDYrr 1795
-VADDPDZ 1796
-VADDPDZrm 1797
-VADDPDZrmb 1798
-VADDPDZrmbk 1799
-VADDPDZrmbkz 1800
-VADDPDZrmk 1801
-VADDPDZrmkz 1802
-VADDPDZrr 1803
-VADDPDZrrb 1804
-VADDPDZrrbk 1805
-VADDPDZrrbkz 1806
-VADDPDZrrk 1807
-VADDPDZrrkz 1808
-VADDPDrm 1809
-VADDPDrr 1810
-VADDPHZ 1811
-VADDPHZrm 1812
-VADDPHZrmb 1813
-VADDPHZrmbk 1814
-VADDPHZrmbkz 1815
-VADDPHZrmk 1816
-VADDPHZrmkz 1817
-VADDPHZrr 1818
-VADDPHZrrb 1819
-VADDPHZrrbk 1820
-VADDPHZrrbkz 1821
-VADDPHZrrk 1822
-VADDPHZrrkz 1823
-VADDPSYrm 1824
-VADDPSYrr 1825
-VADDPSZ 1826
-VADDPSZrm 1827
-VADDPSZrmb 1828
-VADDPSZrmbk 1829
-VADDPSZrmbkz 1830
-VADDPSZrmk 1831
-VADDPSZrmkz 1832
-VADDPSZrr 1833
-VADDPSZrrb 1834
-VADDPSZrrbk 1835
-VADDPSZrrbkz 1836
-VADDPSZrrk 1837
-VADDPSZrrkz 1838
-VADDPSrm 1839
-VADDPSrr 1840
-VADDSDZrm 1841
-VADDSDZrm_Int 1842
-VADDSDZrmk_Int 1843
-VADDSDZrmkz_Int 1844
-VADDSDZrr 1845
-VADDSDZrr_Int 1846
-VADDSDZrrb_Int 1847
-VADDSDZrrbk_Int 1848
-VADDSDZrrbkz_Int 1849
-VADDSDZrrk_Int 1850
-VADDSDZrrkz_Int 1851
-VADDSDrm 1852
-VADDSDrm_Int 1853
-VADDSDrr 1854
-VADDSDrr_Int 1855
-VADDSHZrm 1856
-VADDSHZrm_Int 1857
-VADDSHZrmk_Int 1858
-VADDSHZrmkz_Int 1859
-VADDSHZrr 1860
-VADDSHZrr_Int 1861
-VADDSHZrrb_Int 1862
-VADDSHZrrbk_Int 1863
-VADDSHZrrbkz_Int 1864
-VADDSHZrrk_Int 1865
-VADDSHZrrkz_Int 1866
-VADDSSZrm 1867
-VADDSSZrm_Int 1868
-VADDSSZrmk_Int 1869
-VADDSSZrmkz_Int 1870
-VADDSSZrr 1871
-VADDSSZrr_Int 1872
-VADDSSZrrb_Int 1873
-VADDSSZrrbk_Int 1874
-VADDSSZrrbkz_Int 1875
-VADDSSZrrk_Int 1876
-VADDSSZrrkz_Int 1877
-VADDSSrm 1878
-VADDSSrm_Int 1879
-VADDSSrr 1880
-VADDSSrr_Int 1881
-VADDSUBPDYrm 1882
-VADDSUBPDYrr 1883
-VADDSUBPDrm 1884
-VADDSUBPDrr 1885
-VADDSUBPSYrm 1886
-VADDSUBPSYrr 1887
-VADDSUBPSrm 1888
-VADDSUBPSrr 1889
-VAESDECLASTYrm 1890
-VAESDECLASTYrr 1891
-VAESDECLASTZ 1892
-VAESDECLASTZrm 1893
-VAESDECLASTZrr 1894
-VAESDECLASTrm 1895
-VAESDECLASTrr 1896
-VAESDECYrm 1897
-VAESDECYrr 1898
-VAESDECZ 1899
-VAESDECZrm 1900
-VAESDECZrr 1901
-VAESDECrm 1902
-VAESDECrr 1903
-VAESENCLASTYrm 1904
-VAESENCLASTYrr 1905
-VAESENCLASTZ 1906
-VAESENCLASTZrm 1907
-VAESENCLASTZrr 1908
-VAESENCLASTrm 1909
-VAESENCLASTrr 1910
-VAESENCYrm 1911
-VAESENCYrr 1912
-VAESENCZ 1913
-VAESENCZrm 1914
-VAESENCZrr 1915
-VAESENCrm 1916
-VAESENCrr 1917
-VAESIMCrm 1918
-VAESIMCrr 1919
-VAESKEYGENASSISTrmi 1920
-VAESKEYGENASSISTrri 1921
-VALIGNDZ 1922
-VALIGNDZrmbi 1923
-VALIGNDZrmbik 1924
-VALIGNDZrmbikz 1925
-VALIGNDZrmi 1926
-VALIGNDZrmik 1927
-VALIGNDZrmikz 1928
-VALIGNDZrri 1929
-VALIGNDZrrik 1930
-VALIGNDZrrikz 1931
-VALIGNQZ 1932
-VALIGNQZrmbi 1933
-VALIGNQZrmbik 1934
-VALIGNQZrmbikz 1935
-VALIGNQZrmi 1936
-VALIGNQZrmik 1937
-VALIGNQZrmikz 1938
-VALIGNQZrri 1939
-VALIGNQZrrik 1940
-VALIGNQZrrikz 1941
-VANDNPDYrm 1942
-VANDNPDYrr 1943
-VANDNPDZ 1944
-VANDNPDZrm 1945
-VANDNPDZrmb 1946
-VANDNPDZrmbk 1947
-VANDNPDZrmbkz 1948
-VANDNPDZrmk 1949
-VANDNPDZrmkz 1950
-VANDNPDZrr 1951
-VANDNPDZrrk 1952
-VANDNPDZrrkz 1953
-VANDNPDrm 1954
-VANDNPDrr 1955
-VANDNPSYrm 1956
-VANDNPSYrr 1957
-VANDNPSZ 1958
-VANDNPSZrm 1959
-VANDNPSZrmb 1960
-VANDNPSZrmbk 1961
-VANDNPSZrmbkz 1962
-VANDNPSZrmk 1963
-VANDNPSZrmkz 1964
-VANDNPSZrr 1965
-VANDNPSZrrk 1966
-VANDNPSZrrkz 1967
-VANDNPSrm 1968
-VANDNPSrr 1969
-VANDPDYrm 1970
-VANDPDYrr 1971
-VANDPDZ 1972
-VANDPDZrm 1973
-VANDPDZrmb 1974
-VANDPDZrmbk 1975
-VANDPDZrmbkz 1976
-VANDPDZrmk 1977
-VANDPDZrmkz 1978
-VANDPDZrr 1979
-VANDPDZrrk 1980
-VANDPDZrrkz 1981
-VANDPDrm 1982
-VANDPDrr 1983
-VANDPSYrm 1984
-VANDPSYrr 1985
-VANDPSZ 1986
-VANDPSZrm 1987
-VANDPSZrmb 1988
-VANDPSZrmbk 1989
-VANDPSZrmbkz 1990
-VANDPSZrmk 1991
-VANDPSZrmkz 1992
-VANDPSZrr 1993
-VANDPSZrrk 1994
-VANDPSZrrkz 1995
-VANDPSrm 1996
-VANDPSrr 1997
-VASTART_SAVE_XMM_REGS 1998
-VBCSTNEBF 1999
-VBCSTNESH 2000
-VBLENDMPDZ 2001
-VBLENDMPDZrm 2002
-VBLENDMPDZrmb 2003
-VBLENDMPDZrmbk 2004
-VBLENDMPDZrmbkz 2005
-VBLENDMPDZrmk 2006
-VBLENDMPDZrmkz 2007
-VBLENDMPDZrr 2008
-VBLENDMPDZrrk 2009
-VBLENDMPDZrrkz 2010
-VBLENDMPSZ 2011
-VBLENDMPSZrm 2012
-VBLENDMPSZrmb 2013
-VBLENDMPSZrmbk 2014
-VBLENDMPSZrmbkz 2015
-VBLENDMPSZrmk 2016
-VBLENDMPSZrmkz 2017
-VBLENDMPSZrr 2018
-VBLENDMPSZrrk 2019
-VBLENDMPSZrrkz 2020
-VBLENDPDYrmi 2021
-VBLENDPDYrri 2022
-VBLENDPDrmi 2023
-VBLENDPDrri 2024
-VBLENDPSYrmi 2025
-VBLENDPSYrri 2026
-VBLENDPSrmi 2027
-VBLENDPSrri 2028
-VBLENDVPDYrmr 2029
-VBLENDVPDYrrr 2030
-VBLENDVPDrmr 2031
-VBLENDVPDrrr 2032
-VBLENDVPSYrmr 2033
-VBLENDVPSYrrr 2034
-VBLENDVPSrmr 2035
-VBLENDVPSrrr 2036
-VBROADCASTF 2037
-VBROADCASTI 2038
-VBROADCASTSDYrm 2039
-VBROADCASTSDYrr 2040
-VBROADCASTSDZ 2041
-VBROADCASTSDZrm 2042
-VBROADCASTSDZrmk 2043
-VBROADCASTSDZrmkz 2044
-VBROADCASTSDZrr 2045
-VBROADCASTSDZrrk 2046
-VBROADCASTSDZrrkz 2047
-VBROADCASTSSYrm 2048
-VBROADCASTSSYrr 2049
-VBROADCASTSSZ 2050
-VBROADCASTSSZrm 2051
-VBROADCASTSSZrmk 2052
-VBROADCASTSSZrmkz 2053
-VBROADCASTSSZrr 2054
-VBROADCASTSSZrrk 2055
-VBROADCASTSSZrrkz 2056
-VBROADCASTSSrm 2057
-VBROADCASTSSrr 2058
-VCMPBF 2059
-VCMPPDYrmi 2060
-VCMPPDYrri 2061
-VCMPPDZ 2062
-VCMPPDZrmbi 2063
-VCMPPDZrmbik 2064
-VCMPPDZrmi 2065
-VCMPPDZrmik 2066
-VCMPPDZrri 2067
-VCMPPDZrrib 2068
-VCMPPDZrribk 2069
-VCMPPDZrrik 2070
-VCMPPDrmi 2071
-VCMPPDrri 2072
-VCMPPHZ 2073
-VCMPPHZrmbi 2074
-VCMPPHZrmbik 2075
-VCMPPHZrmi 2076
-VCMPPHZrmik 2077
-VCMPPHZrri 2078
-VCMPPHZrrib 2079
-VCMPPHZrribk 2080
-VCMPPHZrrik 2081
-VCMPPSYrmi 2082
-VCMPPSYrri 2083
-VCMPPSZ 2084
-VCMPPSZrmbi 2085
-VCMPPSZrmbik 2086
-VCMPPSZrmi 2087
-VCMPPSZrmik 2088
-VCMPPSZrri 2089
-VCMPPSZrrib 2090
-VCMPPSZrribk 2091
-VCMPPSZrrik 2092
-VCMPPSrmi 2093
-VCMPPSrri 2094
-VCMPSDZrmi 2095
-VCMPSDZrmi_Int 2096
-VCMPSDZrmik_Int 2097
-VCMPSDZrri 2098
-VCMPSDZrri_Int 2099
-VCMPSDZrrib_Int 2100
-VCMPSDZrribk_Int 2101
-VCMPSDZrrik_Int 2102
-VCMPSDrmi 2103
-VCMPSDrmi_Int 2104
-VCMPSDrri 2105
-VCMPSDrri_Int 2106
-VCMPSHZrmi 2107
-VCMPSHZrmi_Int 2108
-VCMPSHZrmik_Int 2109
-VCMPSHZrri 2110
-VCMPSHZrri_Int 2111
-VCMPSHZrrib_Int 2112
-VCMPSHZrribk_Int 2113
-VCMPSHZrrik_Int 2114
-VCMPSSZrmi 2115
-VCMPSSZrmi_Int 2116
-VCMPSSZrmik_Int 2117
-VCMPSSZrri 2118
-VCMPSSZrri_Int 2119
-VCMPSSZrrib_Int 2120
-VCMPSSZrribk_Int 2121
-VCMPSSZrrik_Int 2122
-VCMPSSrmi 2123
-VCMPSSrmi_Int 2124
-VCMPSSrri 2125
-VCMPSSrri_Int 2126
-VCOMISBF 2127
-VCOMISDZrm 2128
-VCOMISDZrm_Int 2129
-VCOMISDZrr 2130
-VCOMISDZrr_Int 2131
-VCOMISDZrrb 2132
-VCOMISDrm 2133
-VCOMISDrm_Int 2134
-VCOMISDrr 2135
-VCOMISDrr_Int 2136
-VCOMISHZrm 2137
-VCOMISHZrm_Int 2138
-VCOMISHZrr 2139
-VCOMISHZrr_Int 2140
-VCOMISHZrrb 2141
-VCOMISSZrm 2142
-VCOMISSZrm_Int 2143
-VCOMISSZrr 2144
-VCOMISSZrr_Int 2145
-VCOMISSZrrb 2146
-VCOMISSrm 2147
-VCOMISSrm_Int 2148
-VCOMISSrr 2149
-VCOMISSrr_Int 2150
-VCOMPRESSPDZ 2151
-VCOMPRESSPDZmr 2152
-VCOMPRESSPDZmrk 2153
-VCOMPRESSPDZrr 2154
-VCOMPRESSPDZrrk 2155
-VCOMPRESSPDZrrkz 2156
-VCOMPRESSPSZ 2157
-VCOMPRESSPSZmr 2158
-VCOMPRESSPSZmrk 2159
-VCOMPRESSPSZrr 2160
-VCOMPRESSPSZrrk 2161
-VCOMPRESSPSZrrkz 2162
-VCOMXSDZrm_Int 2163
-VCOMXSDZrr_Int 2164
-VCOMXSDZrrb_Int 2165
-VCOMXSHZrm_Int 2166
-VCOMXSHZrr_Int 2167
-VCOMXSHZrrb_Int 2168
-VCOMXSSZrm_Int 2169
-VCOMXSSZrr_Int 2170
-VCOMXSSZrrb_Int 2171
-VCVT 2172
-VCVTBF 2173
-VCVTBIASPH 2174
-VCVTDQ 2175
-VCVTHF 2176
-VCVTNE 2177
-VCVTNEEBF 2178
-VCVTNEEPH 2179
-VCVTNEOBF 2180
-VCVTNEOPH 2181
-VCVTNEPS 2182
-VCVTPD 2183
-VCVTPH 2184
-VCVTPS 2185
-VCVTQQ 2186
-VCVTSD 2187
-VCVTSH 2188
-VCVTSI 2189
-VCVTSS 2190
-VCVTTBF 2191
-VCVTTPD 2192
-VCVTTPH 2193
-VCVTTPS 2194
-VCVTTSD 2195
-VCVTTSH 2196
-VCVTTSS 2197
-VCVTUDQ 2198
-VCVTUQQ 2199
-VCVTUSI 2200
-VCVTUW 2201
-VCVTW 2202
-VDBPSADBWZ 2203
-VDBPSADBWZrmi 2204
-VDBPSADBWZrmik 2205
-VDBPSADBWZrmikz 2206
-VDBPSADBWZrri 2207
-VDBPSADBWZrrik 2208
-VDBPSADBWZrrikz 2209
-VDIVBF 2210
-VDIVPDYrm 2211
-VDIVPDYrr 2212
-VDIVPDZ 2213
-VDIVPDZrm 2214
-VDIVPDZrmb 2215
-VDIVPDZrmbk 2216
-VDIVPDZrmbkz 2217
-VDIVPDZrmk 2218
-VDIVPDZrmkz 2219
-VDIVPDZrr 2220
-VDIVPDZrrb 2221
-VDIVPDZrrbk 2222
-VDIVPDZrrbkz 2223
-VDIVPDZrrk 2224
-VDIVPDZrrkz 2225
-VDIVPDrm 2226
-VDIVPDrr 2227
-VDIVPHZ 2228
-VDIVPHZrm 2229
-VDIVPHZrmb 2230
-VDIVPHZrmbk 2231
-VDIVPHZrmbkz 2232
-VDIVPHZrmk 2233
-VDIVPHZrmkz 2234
-VDIVPHZrr 2235
-VDIVPHZrrb 2236
-VDIVPHZrrbk 2237
-VDIVPHZrrbkz 2238
-VDIVPHZrrk 2239
-VDIVPHZrrkz 2240
-VDIVPSYrm 2241
-VDIVPSYrr 2242
-VDIVPSZ 2243
-VDIVPSZrm 2244
-VDIVPSZrmb 2245
-VDIVPSZrmbk 2246
-VDIVPSZrmbkz 2247
-VDIVPSZrmk 2248
-VDIVPSZrmkz 2249
-VDIVPSZrr 2250
-VDIVPSZrrb 2251
-VDIVPSZrrbk 2252
-VDIVPSZrrbkz 2253
-VDIVPSZrrk 2254
-VDIVPSZrrkz 2255
-VDIVPSrm 2256
-VDIVPSrr 2257
-VDIVSDZrm 2258
-VDIVSDZrm_Int 2259
-VDIVSDZrmk_Int 2260
-VDIVSDZrmkz_Int 2261
-VDIVSDZrr 2262
-VDIVSDZrr_Int 2263
-VDIVSDZrrb_Int 2264
-VDIVSDZrrbk_Int 2265
-VDIVSDZrrbkz_Int 2266
-VDIVSDZrrk_Int 2267
-VDIVSDZrrkz_Int 2268
-VDIVSDrm 2269
-VDIVSDrm_Int 2270
-VDIVSDrr 2271
-VDIVSDrr_Int 2272
-VDIVSHZrm 2273
-VDIVSHZrm_Int 2274
-VDIVSHZrmk_Int 2275
-VDIVSHZrmkz_Int 2276
-VDIVSHZrr 2277
-VDIVSHZrr_Int 2278
-VDIVSHZrrb_Int 2279
-VDIVSHZrrbk_Int 2280
-VDIVSHZrrbkz_Int 2281
-VDIVSHZrrk_Int 2282
-VDIVSHZrrkz_Int 2283
-VDIVSSZrm 2284
-VDIVSSZrm_Int 2285
-VDIVSSZrmk_Int 2286
-VDIVSSZrmkz_Int 2287
-VDIVSSZrr 2288
-VDIVSSZrr_Int 2289
-VDIVSSZrrb_Int 2290
-VDIVSSZrrbk_Int 2291
-VDIVSSZrrbkz_Int 2292
-VDIVSSZrrk_Int 2293
-VDIVSSZrrkz_Int 2294
-VDIVSSrm 2295
-VDIVSSrm_Int 2296
-VDIVSSrr 2297
-VDIVSSrr_Int 2298
-VDPBF 2299
-VDPPDrmi 2300
-VDPPDrri 2301
-VDPPHPSZ 2302
-VDPPHPSZm 2303
-VDPPHPSZmb 2304
-VDPPHPSZmbk 2305
-VDPPHPSZmbkz 2306
-VDPPHPSZmk 2307
-VDPPHPSZmkz 2308
-VDPPHPSZr 2309
-VDPPHPSZrk 2310
-VDPPHPSZrkz 2311
-VDPPSYrmi 2312
-VDPPSYrri 2313
-VDPPSrmi 2314
-VDPPSrri 2315
-VERRm 2316
-VERRr 2317
-VERWm 2318
-VERWr 2319
-VEXP 2320
-VEXPANDPDZ 2321
-VEXPANDPDZrm 2322
-VEXPANDPDZrmk 2323
-VEXPANDPDZrmkz 2324
-VEXPANDPDZrr 2325
-VEXPANDPDZrrk 2326
-VEXPANDPDZrrkz 2327
-VEXPANDPSZ 2328
-VEXPANDPSZrm 2329
-VEXPANDPSZrmk 2330
-VEXPANDPSZrmkz 2331
-VEXPANDPSZrr 2332
-VEXPANDPSZrrk 2333
-VEXPANDPSZrrkz 2334
-VEXTRACTF 2335
-VEXTRACTI 2336
-VEXTRACTPSZmri 2337
-VEXTRACTPSZrri 2338
-VEXTRACTPSmri 2339
-VEXTRACTPSrri 2340
-VFCMADDCPHZ 2341
-VFCMADDCPHZm 2342
-VFCMADDCPHZmb 2343
-VFCMADDCPHZmbk 2344
-VFCMADDCPHZmbkz 2345
-VFCMADDCPHZmk 2346
-VFCMADDCPHZmkz 2347
-VFCMADDCPHZr 2348
-VFCMADDCPHZrb 2349
-VFCMADDCPHZrbk 2350
-VFCMADDCPHZrbkz 2351
-VFCMADDCPHZrk 2352
-VFCMADDCPHZrkz 2353
-VFCMADDCSHZm 2354
-VFCMADDCSHZmk 2355
-VFCMADDCSHZmkz 2356
-VFCMADDCSHZr 2357
-VFCMADDCSHZrb 2358
-VFCMADDCSHZrbk 2359
-VFCMADDCSHZrbkz 2360
-VFCMADDCSHZrk 2361
-VFCMADDCSHZrkz 2362
-VFCMULCPHZ 2363
-VFCMULCPHZrm 2364
-VFCMULCPHZrmb 2365
-VFCMULCPHZrmbk 2366
-VFCMULCPHZrmbkz 2367
-VFCMULCPHZrmk 2368
-VFCMULCPHZrmkz 2369
-VFCMULCPHZrr 2370
-VFCMULCPHZrrb 2371
-VFCMULCPHZrrbk 2372
-VFCMULCPHZrrbkz 2373
-VFCMULCPHZrrk 2374
-VFCMULCPHZrrkz 2375
-VFCMULCSHZrm 2376
-VFCMULCSHZrmk 2377
-VFCMULCSHZrmkz 2378
-VFCMULCSHZrr 2379
-VFCMULCSHZrrb 2380
-VFCMULCSHZrrbk 2381
-VFCMULCSHZrrbkz 2382
-VFCMULCSHZrrk 2383
-VFCMULCSHZrrkz 2384
-VFIXUPIMMPDZ 2385
-VFIXUPIMMPDZrmbi 2386
-VFIXUPIMMPDZrmbik 2387
-VFIXUPIMMPDZrmbikz 2388
-VFIXUPIMMPDZrmi 2389
-VFIXUPIMMPDZrmik 2390
-VFIXUPIMMPDZrmikz 2391
-VFIXUPIMMPDZrri 2392
-VFIXUPIMMPDZrrib 2393
-VFIXUPIMMPDZrribk 2394
-VFIXUPIMMPDZrribkz 2395
-VFIXUPIMMPDZrrik 2396
-VFIXUPIMMPDZrrikz 2397
-VFIXUPIMMPSZ 2398
-VFIXUPIMMPSZrmbi 2399
-VFIXUPIMMPSZrmbik 2400
-VFIXUPIMMPSZrmbikz 2401
-VFIXUPIMMPSZrmi 2402
-VFIXUPIMMPSZrmik 2403
-VFIXUPIMMPSZrmikz 2404
-VFIXUPIMMPSZrri 2405
-VFIXUPIMMPSZrrib 2406
-VFIXUPIMMPSZrribk 2407
-VFIXUPIMMPSZrribkz 2408
-VFIXUPIMMPSZrrik 2409
-VFIXUPIMMPSZrrikz 2410
-VFIXUPIMMSDZrmi 2411
-VFIXUPIMMSDZrmik 2412
-VFIXUPIMMSDZrmikz 2413
-VFIXUPIMMSDZrri 2414
-VFIXUPIMMSDZrrib 2415
-VFIXUPIMMSDZrribk 2416
-VFIXUPIMMSDZrribkz 2417
-VFIXUPIMMSDZrrik 2418
-VFIXUPIMMSDZrrikz 2419
-VFIXUPIMMSSZrmi 2420
-VFIXUPIMMSSZrmik 2421
-VFIXUPIMMSSZrmikz 2422
-VFIXUPIMMSSZrri 2423
-VFIXUPIMMSSZrrib 2424
-VFIXUPIMMSSZrribk 2425
-VFIXUPIMMSSZrribkz 2426
-VFIXUPIMMSSZrrik 2427
-VFIXUPIMMSSZrrikz 2428
-VFMADD 2429
-VFMADDCPHZ 2430
-VFMADDCPHZm 2431
-VFMADDCPHZmb 2432
-VFMADDCPHZmbk 2433
-VFMADDCPHZmbkz 2434
-VFMADDCPHZmk 2435
-VFMADDCPHZmkz 2436
-VFMADDCPHZr 2437
-VFMADDCPHZrb 2438
-VFMADDCPHZrbk 2439
-VFMADDCPHZrbkz 2440
-VFMADDCPHZrk 2441
-VFMADDCPHZrkz 2442
-VFMADDCSHZm 2443
-VFMADDCSHZmk 2444
-VFMADDCSHZmkz 2445
-VFMADDCSHZr 2446
-VFMADDCSHZrb 2447
-VFMADDCSHZrbk 2448
-VFMADDCSHZrbkz 2449
-VFMADDCSHZrk 2450
-VFMADDCSHZrkz 2451
-VFMADDPD 2452
-VFMADDPS 2453
-VFMADDSD 2454
-VFMADDSS 2455
-VFMADDSUB 2456
-VFMADDSUBPD 2457
-VFMADDSUBPS 2458
-VFMSUB 2459
-VFMSUBADD 2460
-VFMSUBADDPD 2461
-VFMSUBADDPS 2462
-VFMSUBPD 2463
-VFMSUBPS 2464
-VFMSUBSD 2465
-VFMSUBSS 2466
-VFMULCPHZ 2467
-VFMULCPHZrm 2468
-VFMULCPHZrmb 2469
-VFMULCPHZrmbk 2470
-VFMULCPHZrmbkz 2471
-VFMULCPHZrmk 2472
-VFMULCPHZrmkz 2473
-VFMULCPHZrr 2474
-VFMULCPHZrrb 2475
-VFMULCPHZrrbk 2476
-VFMULCPHZrrbkz 2477
-VFMULCPHZrrk 2478
-VFMULCPHZrrkz 2479
-VFMULCSHZrm 2480
-VFMULCSHZrmk 2481
-VFMULCSHZrmkz 2482
-VFMULCSHZrr 2483
-VFMULCSHZrrb 2484
-VFMULCSHZrrbk 2485
-VFMULCSHZrrbkz 2486
-VFMULCSHZrrk 2487
-VFMULCSHZrrkz 2488
-VFNMADD 2489
-VFNMADDPD 2490
-VFNMADDPS 2491
-VFNMADDSD 2492
-VFNMADDSS 2493
-VFNMSUB 2494
-VFNMSUBPD 2495
-VFNMSUBPS 2496
-VFNMSUBSD 2497
-VFNMSUBSS 2498
-VFPCLASSBF 2499
-VFPCLASSPDZ 2500
-VFPCLASSPDZmbi 2501
-VFPCLASSPDZmbik 2502
-VFPCLASSPDZmi 2503
-VFPCLASSPDZmik 2504
-VFPCLASSPDZri 2505
-VFPCLASSPDZrik 2506
-VFPCLASSPHZ 2507
-VFPCLASSPHZmbi 2508
-VFPCLASSPHZmbik 2509
-VFPCLASSPHZmi 2510
-VFPCLASSPHZmik 2511
-VFPCLASSPHZri 2512
-VFPCLASSPHZrik 2513
-VFPCLASSPSZ 2514
-VFPCLASSPSZmbi 2515
-VFPCLASSPSZmbik 2516
-VFPCLASSPSZmi 2517
-VFPCLASSPSZmik 2518
-VFPCLASSPSZri 2519
-VFPCLASSPSZrik 2520
-VFPCLASSSDZmi 2521
-VFPCLASSSDZmik 2522
-VFPCLASSSDZri 2523
-VFPCLASSSDZrik 2524
-VFPCLASSSHZmi 2525
-VFPCLASSSHZmik 2526
-VFPCLASSSHZri 2527
-VFPCLASSSHZrik 2528
-VFPCLASSSSZmi 2529
-VFPCLASSSSZmik 2530
-VFPCLASSSSZri 2531
-VFPCLASSSSZrik 2532
-VFRCZPDYrm 2533
-VFRCZPDYrr 2534
-VFRCZPDrm 2535
-VFRCZPDrr 2536
-VFRCZPSYrm 2537
-VFRCZPSYrr 2538
-VFRCZPSrm 2539
-VFRCZPSrr 2540
-VFRCZSDrm 2541
-VFRCZSDrr 2542
-VFRCZSSrm 2543
-VFRCZSSrr 2544
-VGATHERDPDYrm 2545
-VGATHERDPDZ 2546
-VGATHERDPDZrm 2547
-VGATHERDPDrm 2548
-VGATHERDPSYrm 2549
-VGATHERDPSZ 2550
-VGATHERDPSZrm 2551
-VGATHERDPSrm 2552
-VGATHERPF 2553
-VGATHERQPDYrm 2554
-VGATHERQPDZ 2555
-VGATHERQPDZrm 2556
-VGATHERQPDrm 2557
-VGATHERQPSYrm 2558
-VGATHERQPSZ 2559
-VGATHERQPSZrm 2560
-VGATHERQPSrm 2561
-VGETEXPBF 2562
-VGETEXPPDZ 2563
-VGETEXPPDZm 2564
-VGETEXPPDZmb 2565
-VGETEXPPDZmbk 2566
-VGETEXPPDZmbkz 2567
-VGETEXPPDZmk 2568
-VGETEXPPDZmkz 2569
-VGETEXPPDZr 2570
-VGETEXPPDZrb 2571
-VGETEXPPDZrbk 2572
-VGETEXPPDZrbkz 2573
-VGETEXPPDZrk 2574
-VGETEXPPDZrkz 2575
-VGETEXPPHZ 2576
-VGETEXPPHZm 2577
-VGETEXPPHZmb 2578
-VGETEXPPHZmbk 2579
-VGETEXPPHZmbkz 2580
-VGETEXPPHZmk 2581
-VGETEXPPHZmkz 2582
-VGETEXPPHZr 2583
-VGETEXPPHZrb 2584
-VGETEXPPHZrbk 2585
-VGETEXPPHZrbkz 2586
-VGETEXPPHZrk 2587
-VGETEXPPHZrkz 2588
-VGETEXPPSZ 2589
-VGETEXPPSZm 2590
-VGETEXPPSZmb 2591
-VGETEXPPSZmbk 2592
-VGETEXPPSZmbkz 2593
-VGETEXPPSZmk 2594
-VGETEXPPSZmkz 2595
-VGETEXPPSZr 2596
-VGETEXPPSZrb 2597
-VGETEXPPSZrbk 2598
-VGETEXPPSZrbkz 2599
-VGETEXPPSZrk 2600
-VGETEXPPSZrkz 2601
-VGETEXPSDZm 2602
-VGETEXPSDZmk 2603
-VGETEXPSDZmkz 2604
-VGETEXPSDZr 2605
-VGETEXPSDZrb 2606
-VGETEXPSDZrbk 2607
-VGETEXPSDZrbkz 2608
-VGETEXPSDZrk 2609
-VGETEXPSDZrkz 2610
-VGETEXPSHZm 2611
-VGETEXPSHZmk 2612
-VGETEXPSHZmkz 2613
-VGETEXPSHZr 2614
-VGETEXPSHZrb 2615
-VGETEXPSHZrbk 2616
-VGETEXPSHZrbkz 2617
-VGETEXPSHZrk 2618
-VGETEXPSHZrkz 2619
-VGETEXPSSZm 2620
-VGETEXPSSZmk 2621
-VGETEXPSSZmkz 2622
-VGETEXPSSZr 2623
-VGETEXPSSZrb 2624
-VGETEXPSSZrbk 2625
-VGETEXPSSZrbkz 2626
-VGETEXPSSZrk 2627
-VGETEXPSSZrkz 2628
-VGETMANTBF 2629
-VGETMANTPDZ 2630
-VGETMANTPDZrmbi 2631
-VGETMANTPDZrmbik 2632
-VGETMANTPDZrmbikz 2633
-VGETMANTPDZrmi 2634
-VGETMANTPDZrmik 2635
-VGETMANTPDZrmikz 2636
-VGETMANTPDZrri 2637
-VGETMANTPDZrrib 2638
-VGETMANTPDZrribk 2639
-VGETMANTPDZrribkz 2640
-VGETMANTPDZrrik 2641
-VGETMANTPDZrrikz 2642
-VGETMANTPHZ 2643
-VGETMANTPHZrmbi 2644
-VGETMANTPHZrmbik 2645
-VGETMANTPHZrmbikz 2646
-VGETMANTPHZrmi 2647
-VGETMANTPHZrmik 2648
-VGETMANTPHZrmikz 2649
-VGETMANTPHZrri 2650
-VGETMANTPHZrrib 2651
-VGETMANTPHZrribk 2652
-VGETMANTPHZrribkz 2653
-VGETMANTPHZrrik 2654
-VGETMANTPHZrrikz 2655
-VGETMANTPSZ 2656
-VGETMANTPSZrmbi 2657
-VGETMANTPSZrmbik 2658
-VGETMANTPSZrmbikz 2659
-VGETMANTPSZrmi 2660
-VGETMANTPSZrmik 2661
-VGETMANTPSZrmikz 2662
-VGETMANTPSZrri 2663
-VGETMANTPSZrrib 2664
-VGETMANTPSZrribk 2665
-VGETMANTPSZrribkz 2666
-VGETMANTPSZrrik 2667
-VGETMANTPSZrrikz 2668
-VGETMANTSDZrmi 2669
-VGETMANTSDZrmik 2670
-VGETMANTSDZrmikz 2671
-VGETMANTSDZrri 2672
-VGETMANTSDZrrib 2673
-VGETMANTSDZrribk 2674
-VGETMANTSDZrribkz 2675
-VGETMANTSDZrrik 2676
-VGETMANTSDZrrikz 2677
-VGETMANTSHZrmi 2678
-VGETMANTSHZrmik 2679
-VGETMANTSHZrmikz 2680
-VGETMANTSHZrri 2681
-VGETMANTSHZrrib 2682
-VGETMANTSHZrribk 2683
-VGETMANTSHZrribkz 2684
-VGETMANTSHZrrik 2685
-VGETMANTSHZrrikz 2686
-VGETMANTSSZrmi 2687
-VGETMANTSSZrmik 2688
-VGETMANTSSZrmikz 2689
-VGETMANTSSZrri 2690
-VGETMANTSSZrrib 2691
-VGETMANTSSZrribk 2692
-VGETMANTSSZrribkz 2693
-VGETMANTSSZrrik 2694
-VGETMANTSSZrrikz 2695
-VGF 2696
-VHADDPDYrm 2697
-VHADDPDYrr 2698
-VHADDPDrm 2699
-VHADDPDrr 2700
-VHADDPSYrm 2701
-VHADDPSYrr 2702
-VHADDPSrm 2703
-VHADDPSrr 2704
-VHSUBPDYrm 2705
-VHSUBPDYrr 2706
-VHSUBPDrm 2707
-VHSUBPDrr 2708
-VHSUBPSYrm 2709
-VHSUBPSYrr 2710
-VHSUBPSrm 2711
-VHSUBPSrr 2712
-VINSERTF 2713
-VINSERTI 2714
-VINSERTPSZrmi 2715
-VINSERTPSZrri 2716
-VINSERTPSrmi 2717
-VINSERTPSrri 2718
-VLDDQUYrm 2719
-VLDDQUrm 2720
-VLDMXCSR 2721
-VMASKMOVDQU 2722
-VMASKMOVPDYmr 2723
-VMASKMOVPDYrm 2724
-VMASKMOVPDmr 2725
-VMASKMOVPDrm 2726
-VMASKMOVPSYmr 2727
-VMASKMOVPSYrm 2728
-VMASKMOVPSmr 2729
-VMASKMOVPSrm 2730
-VMAXBF 2731
-VMAXCPDYrm 2732
-VMAXCPDYrr 2733
-VMAXCPDZ 2734
-VMAXCPDZrm 2735
-VMAXCPDZrmb 2736
-VMAXCPDZrmbk 2737
-VMAXCPDZrmbkz 2738
-VMAXCPDZrmk 2739
-VMAXCPDZrmkz 2740
-VMAXCPDZrr 2741
-VMAXCPDZrrk 2742
-VMAXCPDZrrkz 2743
-VMAXCPDrm 2744
-VMAXCPDrr 2745
-VMAXCPHZ 2746
-VMAXCPHZrm 2747
-VMAXCPHZrmb 2748
-VMAXCPHZrmbk 2749
-VMAXCPHZrmbkz 2750
-VMAXCPHZrmk 2751
-VMAXCPHZrmkz 2752
-VMAXCPHZrr 2753
-VMAXCPHZrrk 2754
-VMAXCPHZrrkz 2755
-VMAXCPSYrm 2756
-VMAXCPSYrr 2757
-VMAXCPSZ 2758
-VMAXCPSZrm 2759
-VMAXCPSZrmb 2760
-VMAXCPSZrmbk 2761
-VMAXCPSZrmbkz 2762
-VMAXCPSZrmk 2763
-VMAXCPSZrmkz 2764
-VMAXCPSZrr 2765
-VMAXCPSZrrk 2766
-VMAXCPSZrrkz 2767
-VMAXCPSrm 2768
-VMAXCPSrr 2769
-VMAXCSDZrm 2770
-VMAXCSDZrr 2771
-VMAXCSDrm 2772
-VMAXCSDrr 2773
-VMAXCSHZrm 2774
-VMAXCSHZrr 2775
-VMAXCSSZrm 2776
-VMAXCSSZrr 2777
-VMAXCSSrm 2778
-VMAXCSSrr 2779
-VMAXPDYrm 2780
-VMAXPDYrr 2781
-VMAXPDZ 2782
-VMAXPDZrm 2783
-VMAXPDZrmb 2784
-VMAXPDZrmbk 2785
-VMAXPDZrmbkz 2786
-VMAXPDZrmk 2787
-VMAXPDZrmkz 2788
-VMAXPDZrr 2789
-VMAXPDZrrb 2790
-VMAXPDZrrbk 2791
-VMAXPDZrrbkz 2792
-VMAXPDZrrk 2793
-VMAXPDZrrkz 2794
-VMAXPDrm 2795
-VMAXPDrr 2796
-VMAXPHZ 2797
-VMAXPHZrm 2798
-VMAXPHZrmb 2799
-VMAXPHZrmbk 2800
-VMAXPHZrmbkz 2801
-VMAXPHZrmk 2802
-VMAXPHZrmkz 2803
-VMAXPHZrr 2804
-VMAXPHZrrb 2805
-VMAXPHZrrbk 2806
-VMAXPHZrrbkz 2807
-VMAXPHZrrk 2808
-VMAXPHZrrkz 2809
-VMAXPSYrm 2810
-VMAXPSYrr 2811
-VMAXPSZ 2812
-VMAXPSZrm 2813
-VMAXPSZrmb 2814
-VMAXPSZrmbk 2815
-VMAXPSZrmbkz 2816
-VMAXPSZrmk 2817
-VMAXPSZrmkz 2818
-VMAXPSZrr 2819
-VMAXPSZrrb 2820
-VMAXPSZrrbk 2821
-VMAXPSZrrbkz 2822
-VMAXPSZrrk 2823
-VMAXPSZrrkz 2824
-VMAXPSrm 2825
-VMAXPSrr 2826
-VMAXSDZrm 2827
-VMAXSDZrm_Int 2828
-VMAXSDZrmk_Int 2829
-VMAXSDZrmkz_Int 2830
-VMAXSDZrr 2831
-VMAXSDZrr_Int 2832
-VMAXSDZrrb_Int 2833
-VMAXSDZrrbk_Int 2834
-VMAXSDZrrbkz_Int 2835
-VMAXSDZrrk_Int 2836
-VMAXSDZrrkz_Int 2837
-VMAXSDrm 2838
-VMAXSDrm_Int 2839
-VMAXSDrr 2840
-VMAXSDrr_Int 2841
-VMAXSHZrm 2842
-VMAXSHZrm_Int 2843
-VMAXSHZrmk_Int 2844
-VMAXSHZrmkz_Int 2845
-VMAXSHZrr 2846
-VMAXSHZrr_Int 2847
-VMAXSHZrrb_Int 2848
-VMAXSHZrrbk_Int 2849
-VMAXSHZrrbkz_Int 2850
-VMAXSHZrrk_Int 2851
-VMAXSHZrrkz_Int 2852
-VMAXSSZrm 2853
-VMAXSSZrm_Int 2854
-VMAXSSZrmk_Int 2855
-VMAXSSZrmkz_Int 2856
-VMAXSSZrr 2857
-VMAXSSZrr_Int 2858
-VMAXSSZrrb_Int 2859
-VMAXSSZrrbk_Int 2860
-VMAXSSZrrbkz_Int 2861
-VMAXSSZrrk_Int 2862
-VMAXSSZrrkz_Int 2863
-VMAXSSrm 2864
-VMAXSSrm_Int 2865
-VMAXSSrr 2866
-VMAXSSrr_Int 2867
-VMCALL 2868
-VMCLEARm 2869
-VMFUNC 2870
-VMINBF 2871
-VMINCPDYrm 2872
-VMINCPDYrr 2873
-VMINCPDZ 2874
-VMINCPDZrm 2875
-VMINCPDZrmb 2876
-VMINCPDZrmbk 2877
-VMINCPDZrmbkz 2878
-VMINCPDZrmk 2879
-VMINCPDZrmkz 2880
-VMINCPDZrr 2881
-VMINCPDZrrk 2882
-VMINCPDZrrkz 2883
-VMINCPDrm 2884
-VMINCPDrr 2885
-VMINCPHZ 2886
-VMINCPHZrm 2887
-VMINCPHZrmb 2888
-VMINCPHZrmbk 2889
-VMINCPHZrmbkz 2890
-VMINCPHZrmk 2891
-VMINCPHZrmkz 2892
-VMINCPHZrr 2893
-VMINCPHZrrk 2894
-VMINCPHZrrkz 2895
-VMINCPSYrm 2896
-VMINCPSYrr 2897
-VMINCPSZ 2898
-VMINCPSZrm 2899
-VMINCPSZrmb 2900
-VMINCPSZrmbk 2901
-VMINCPSZrmbkz 2902
-VMINCPSZrmk 2903
-VMINCPSZrmkz 2904
-VMINCPSZrr 2905
-VMINCPSZrrk 2906
-VMINCPSZrrkz 2907
-VMINCPSrm 2908
-VMINCPSrr 2909
-VMINCSDZrm 2910
-VMINCSDZrr 2911
-VMINCSDrm 2912
-VMINCSDrr 2913
-VMINCSHZrm 2914
-VMINCSHZrr 2915
-VMINCSSZrm 2916
-VMINCSSZrr 2917
-VMINCSSrm 2918
-VMINCSSrr 2919
-VMINMAXBF 2920
-VMINMAXPDZ 2921
-VMINMAXPDZrmbi 2922
-VMINMAXPDZrmbik 2923
-VMINMAXPDZrmbikz 2924
-VMINMAXPDZrmi 2925
-VMINMAXPDZrmik 2926
-VMINMAXPDZrmikz 2927
-VMINMAXPDZrri 2928
-VMINMAXPDZrrib 2929
-VMINMAXPDZrribk 2930
-VMINMAXPDZrribkz 2931
-VMINMAXPDZrrik 2932
-VMINMAXPDZrrikz 2933
-VMINMAXPHZ 2934
-VMINMAXPHZrmbi 2935
-VMINMAXPHZrmbik 2936
-VMINMAXPHZrmbikz 2937
-VMINMAXPHZrmi 2938
-VMINMAXPHZrmik 2939
-VMINMAXPHZrmikz 2940
-VMINMAXPHZrri 2941
-VMINMAXPHZrrib 2942
-VMINMAXPHZrribk 2943
-VMINMAXPHZrribkz 2944
-VMINMAXPHZrrik 2945
-VMINMAXPHZrrikz 2946
-VMINMAXPSZ 2947
-VMINMAXPSZrmbi 2948
-VMINMAXPSZrmbik 2949
-VMINMAXPSZrmbikz 2950
-VMINMAXPSZrmi 2951
-VMINMAXPSZrmik 2952
-VMINMAXPSZrmikz 2953
-VMINMAXPSZrri 2954
-VMINMAXPSZrrib 2955
-VMINMAXPSZrribk 2956
-VMINMAXPSZrribkz 2957
-VMINMAXPSZrrik 2958
-VMINMAXPSZrrikz 2959
-VMINMAXSDrmi 2960
-VMINMAXSDrmi_Int 2961
-VMINMAXSDrmik_Int 2962
-VMINMAXSDrmikz_Int 2963
-VMINMAXSDrri 2964
-VMINMAXSDrri_Int 2965
-VMINMAXSDrrib_Int 2966
-VMINMAXSDrribk_Int 2967
-VMINMAXSDrribkz_Int 2968
-VMINMAXSDrrik_Int 2969
-VMINMAXSDrrikz_Int 2970
-VMINMAXSHrmi 2971
-VMINMAXSHrmi_Int 2972
-VMINMAXSHrmik_Int 2973
-VMINMAXSHrmikz_Int 2974
-VMINMAXSHrri 2975
-VMINMAXSHrri_Int 2976
-VMINMAXSHrrib_Int 2977
-VMINMAXSHrribk_Int 2978
-VMINMAXSHrribkz_Int 2979
-VMINMAXSHrrik_Int 2980
-VMINMAXSHrrikz_Int 2981
-VMINMAXSSrmi 2982
-VMINMAXSSrmi_Int 2983
-VMINMAXSSrmik_Int 2984
-VMINMAXSSrmikz_Int 2985
-VMINMAXSSrri 2986
-VMINMAXSSrri_Int 2987
-VMINMAXSSrrib_Int 2988
-VMINMAXSSrribk_Int 2989
-VMINMAXSSrribkz_Int 2990
-VMINMAXSSrrik_Int 2991
-VMINMAXSSrrikz_Int 2992
-VMINPDYrm 2993
-VMINPDYrr 2994
-VMINPDZ 2995
-VMINPDZrm 2996
-VMINPDZrmb 2997
-VMINPDZrmbk 2998
-VMINPDZrmbkz 2999
-VMINPDZrmk 3000
-VMINPDZrmkz 3001
-VMINPDZrr 3002
-VMINPDZrrb 3003
-VMINPDZrrbk 3004
-VMINPDZrrbkz 3005
-VMINPDZrrk 3006
-VMINPDZrrkz 3007
-VMINPDrm 3008
-VMINPDrr 3009
-VMINPHZ 3010
-VMINPHZrm 3011
-VMINPHZrmb 3012
-VMINPHZrmbk 3013
-VMINPHZrmbkz 3014
-VMINPHZrmk 3015
-VMINPHZrmkz 3016
-VMINPHZrr 3017
-VMINPHZrrb 3018
-VMINPHZrrbk 3019
-VMINPHZrrbkz 3020
-VMINPHZrrk 3021
-VMINPHZrrkz 3022
-VMINPSYrm 3023
-VMINPSYrr 3024
-VMINPSZ 3025
-VMINPSZrm 3026
-VMINPSZrmb 3027
-VMINPSZrmbk 3028
-VMINPSZrmbkz 3029
-VMINPSZrmk 3030
-VMINPSZrmkz 3031
-VMINPSZrr 3032
-VMINPSZrrb 3033
-VMINPSZrrbk 3034
-VMINPSZrrbkz 3035
-VMINPSZrrk 3036
-VMINPSZrrkz 3037
-VMINPSrm 3038
-VMINPSrr 3039
-VMINSDZrm 3040
-VMINSDZrm_Int 3041
-VMINSDZrmk_Int 3042
-VMINSDZrmkz_Int 3043
-VMINSDZrr 3044
-VMINSDZrr_Int 3045
-VMINSDZrrb_Int 3046
-VMINSDZrrbk_Int 3047
-VMINSDZrrbkz_Int 3048
-VMINSDZrrk_Int 3049
-VMINSDZrrkz_Int 3050
-VMINSDrm 3051
-VMINSDrm_Int 3052
-VMINSDrr 3053
-VMINSDrr_Int 3054
-VMINSHZrm 3055
-VMINSHZrm_Int 3056
-VMINSHZrmk_Int 3057
-VMINSHZrmkz_Int 3058
-VMINSHZrr 3059
-VMINSHZrr_Int 3060
-VMINSHZrrb_Int 3061
-VMINSHZrrbk_Int 3062
-VMINSHZrrbkz_Int 3063
-VMINSHZrrk_Int 3064
-VMINSHZrrkz_Int 3065
-VMINSSZrm 3066
-VMINSSZrm_Int 3067
-VMINSSZrmk_Int 3068
-VMINSSZrmkz_Int 3069
-VMINSSZrr 3070
-VMINSSZrr_Int 3071
-VMINSSZrrb_Int 3072
-VMINSSZrrbk_Int 3073
-VMINSSZrrbkz_Int 3074
-VMINSSZrrk_Int 3075
-VMINSSZrrkz_Int 3076
-VMINSSrm 3077
-VMINSSrm_Int 3078
-VMINSSrr 3079
-VMINSSrr_Int 3080
-VMLAUNCH 3081
-VMLOAD 3082
-VMMCALL 3083
-VMOV 3084
-VMOVAPDYmr 3085
-VMOVAPDYrm 3086
-VMOVAPDYrr 3087
-VMOVAPDYrr_REV 3088
-VMOVAPDZ 3089
-VMOVAPDZmr 3090
-VMOVAPDZmrk 3091
-VMOVAPDZrm 3092
-VMOVAPDZrmk 3093
-VMOVAPDZrmkz 3094
-VMOVAPDZrr 3095
-VMOVAPDZrr_REV 3096
-VMOVAPDZrrk 3097
-VMOVAPDZrrk_REV 3098
-VMOVAPDZrrkz 3099
-VMOVAPDZrrkz_REV 3100
-VMOVAPDmr 3101
-VMOVAPDrm 3102
-VMOVAPDrr 3103
-VMOVAPDrr_REV 3104
-VMOVAPSYmr 3105
-VMOVAPSYrm 3106
-VMOVAPSYrr 3107
-VMOVAPSYrr_REV 3108
-VMOVAPSZ 3109
-VMOVAPSZmr 3110
-VMOVAPSZmrk 3111
-VMOVAPSZrm 3112
-VMOVAPSZrmk 3113
-VMOVAPSZrmkz 3114
-VMOVAPSZrr 3115
-VMOVAPSZrr_REV 3116
-VMOVAPSZrrk 3117
-VMOVAPSZrrk_REV 3118
-VMOVAPSZrrkz 3119
-VMOVAPSZrrkz_REV 3120
-VMOVAPSmr 3121
-VMOVAPSrm 3122
-VMOVAPSrr 3123
-VMOVAPSrr_REV 3124
-VMOVDDUPYrm 3125
-VMOVDDUPYrr 3126
-VMOVDDUPZ 3127
-VMOVDDUPZrm 3128
-VMOVDDUPZrmk 3129
-VMOVDDUPZrmkz 3130
-VMOVDDUPZrr 3131
-VMOVDDUPZrrk 3132
-VMOVDDUPZrrkz 3133
-VMOVDDUPrm 3134
-VMOVDDUPrr 3135
-VMOVDI 3136
-VMOVDQA 3137
-VMOVDQAYmr 3138
-VMOVDQAYrm 3139
-VMOVDQAYrr 3140
-VMOVDQAYrr_REV 3141
-VMOVDQAmr 3142
-VMOVDQArm 3143
-VMOVDQArr 3144
-VMOVDQArr_REV 3145
-VMOVDQU 3146
-VMOVDQUYmr 3147
-VMOVDQUYrm 3148
-VMOVDQUYrr 3149
-VMOVDQUYrr_REV 3150
-VMOVDQUmr 3151
-VMOVDQUrm 3152
-VMOVDQUrr 3153
-VMOVDQUrr_REV 3154
-VMOVHLPSZrr 3155
-VMOVHLPSrr 3156
-VMOVHPDZ 3157
-VMOVHPDmr 3158
-VMOVHPDrm 3159
-VMOVHPSZ 3160
-VMOVHPSmr 3161
-VMOVHPSrm 3162
-VMOVLHPSZrr 3163
-VMOVLHPSrr 3164
-VMOVLPDZ 3165
-VMOVLPDmr 3166
-VMOVLPDrm 3167
-VMOVLPSZ 3168
-VMOVLPSmr 3169
-VMOVLPSrm 3170
-VMOVMSKPDYrr 3171
-VMOVMSKPDrr 3172
-VMOVMSKPSYrr 3173
-VMOVMSKPSrr 3174
-VMOVNTDQAYrm 3175
-VMOVNTDQAZ 3176
-VMOVNTDQAZrm 3177
-VMOVNTDQArm 3178
-VMOVNTDQYmr 3179
-VMOVNTDQZ 3180
-VMOVNTDQZmr 3181
-VMOVNTDQmr 3182
-VMOVNTPDYmr 3183
-VMOVNTPDZ 3184
-VMOVNTPDZmr 3185
-VMOVNTPDmr 3186
-VMOVNTPSYmr 3187
-VMOVNTPSZ 3188
-VMOVNTPSZmr 3189
-VMOVNTPSmr 3190
-VMOVPDI 3191
-VMOVPQI 3192
-VMOVPQIto 3193
-VMOVQI 3194
-VMOVRSBZ 3195
-VMOVRSBZm 3196
-VMOVRSBZmk 3197
-VMOVRSBZmkz 3198
-VMOVRSDZ 3199
-VMOVRSDZm 3200
-VMOVRSDZmk 3201
-VMOVRSDZmkz 3202
-VMOVRSQZ 3203
-VMOVRSQZm 3204
-VMOVRSQZmk 3205
-VMOVRSQZmkz 3206
-VMOVRSWZ 3207
-VMOVRSWZm 3208
-VMOVRSWZmk 3209
-VMOVRSWZmkz 3210
-VMOVSDZmr 3211
-VMOVSDZmrk 3212
-VMOVSDZrm 3213
-VMOVSDZrm_alt 3214
-VMOVSDZrmk 3215
-VMOVSDZrmkz 3216
-VMOVSDZrr 3217
-VMOVSDZrr_REV 3218
-VMOVSDZrrk 3219
-VMOVSDZrrk_REV 3220
-VMOVSDZrrkz 3221
-VMOVSDZrrkz_REV 3222
-VMOVSDmr 3223
-VMOVSDrm 3224
-VMOVSDrm_alt 3225
-VMOVSDrr 3226
-VMOVSDrr_REV 3227
-VMOVSDto 3228
-VMOVSH 3229
-VMOVSHDUPYrm 3230
-VMOVSHDUPYrr 3231
-VMOVSHDUPZ 3232
-VMOVSHDUPZrm 3233
-VMOVSHDUPZrmk 3234
-VMOVSHDUPZrmkz 3235
-VMOVSHDUPZrr 3236
-VMOVSHDUPZrrk 3237
-VMOVSHDUPZrrkz 3238
-VMOVSHDUPrm 3239
-VMOVSHDUPrr 3240
-VMOVSHZmr 3241
-VMOVSHZmrk 3242
-VMOVSHZrm 3243
-VMOVSHZrm_alt 3244
-VMOVSHZrmk 3245
-VMOVSHZrmkz 3246
-VMOVSHZrr 3247
-VMOVSHZrr_REV 3248
-VMOVSHZrrk 3249
-VMOVSHZrrk_REV 3250
-VMOVSHZrrkz 3251
-VMOVSHZrrkz_REV 3252
-VMOVSHtoW 3253
-VMOVSLDUPYrm 3254
-VMOVSLDUPYrr 3255
-VMOVSLDUPZ 3256
-VMOVSLDUPZrm 3257
-VMOVSLDUPZrmk 3258
-VMOVSLDUPZrmkz 3259
-VMOVSLDUPZrr 3260
-VMOVSLDUPZrrk 3261
-VMOVSLDUPZrrkz 3262
-VMOVSLDUPrm 3263
-VMOVSLDUPrr 3264
-VMOVSS 3265
-VMOVSSZmr 3266
-VMOVSSZmrk 3267
-VMOVSSZrm 3268
-VMOVSSZrm_alt 3269
-VMOVSSZrmk 3270
-VMOVSSZrmkz 3271
-VMOVSSZrr 3272
-VMOVSSZrr_REV 3273
-VMOVSSZrrk 3274
-VMOVSSZrrk_REV 3275
-VMOVSSZrrkz 3276
-VMOVSSZrrkz_REV 3277
-VMOVSSmr 3278
-VMOVSSrm 3279
-VMOVSSrm_alt 3280
-VMOVSSrr 3281
-VMOVSSrr_REV 3282
-VMOVUPDYmr 3283
-VMOVUPDYrm 3284
-VMOVUPDYrr 3285
-VMOVUPDYrr_REV 3286
-VMOVUPDZ 3287
-VMOVUPDZmr 3288
-VMOVUPDZmrk 3289
-VMOVUPDZrm 3290
-VMOVUPDZrmk 3291
-VMOVUPDZrmkz 3292
-VMOVUPDZrr 3293
-VMOVUPDZrr_REV 3294
-VMOVUPDZrrk 3295
-VMOVUPDZrrk_REV 3296
-VMOVUPDZrrkz 3297
-VMOVUPDZrrkz_REV 3298
-VMOVUPDmr 3299
-VMOVUPDrm 3300
-VMOVUPDrr 3301
-VMOVUPDrr_REV 3302
-VMOVUPSYmr 3303
-VMOVUPSYrm 3304
-VMOVUPSYrr 3305
-VMOVUPSYrr_REV 3306
-VMOVUPSZ 3307
-VMOVUPSZmr 3308
-VMOVUPSZmrk 3309
-VMOVUPSZrm 3310
-VMOVUPSZrmk 3311
-VMOVUPSZrmkz 3312
-VMOVUPSZrr 3313
-VMOVUPSZrr_REV 3314
-VMOVUPSZrrk 3315
-VMOVUPSZrrk_REV 3316
-VMOVUPSZrrkz 3317
-VMOVUPSZrrkz_REV 3318
-VMOVUPSmr 3319
-VMOVUPSrm 3320
-VMOVUPSrr 3321
-VMOVUPSrr_REV 3322
-VMOVW 3323
-VMOVWmr 3324
-VMOVWrm 3325
-VMOVZPDILo 3326
-VMOVZPQILo 3327
-VMOVZPWILo 3328
-VMPSADBWYrmi 3329
-VMPSADBWYrri 3330
-VMPSADBWZ 3331
-VMPSADBWZrmi 3332
-VMPSADBWZrmik 3333
-VMPSADBWZrmikz 3334
-VMPSADBWZrri 3335
-VMPSADBWZrrik 3336
-VMPSADBWZrrikz 3337
-VMPSADBWrmi 3338
-VMPSADBWrri 3339
-VMPTRLDm 3340
-VMPTRSTm 3341
-VMREAD 3342
-VMRESUME 3343
-VMRUN 3344
-VMSAVE 3345
-VMULBF 3346
-VMULPDYrm 3347
-VMULPDYrr 3348
-VMULPDZ 3349
-VMULPDZrm 3350
-VMULPDZrmb 3351
-VMULPDZrmbk 3352
-VMULPDZrmbkz 3353
-VMULPDZrmk 3354
-VMULPDZrmkz 3355
-VMULPDZrr 3356
-VMULPDZrrb 3357
-VMULPDZrrbk 3358
-VMULPDZrrbkz 3359
-VMULPDZrrk 3360
-VMULPDZrrkz 3361
-VMULPDrm 3362
-VMULPDrr 3363
-VMULPHZ 3364
-VMULPHZrm 3365
-VMULPHZrmb 3366
-VMULPHZrmbk 3367
-VMULPHZrmbkz 3368
-VMULPHZrmk 3369
-VMULPHZrmkz 3370
-VMULPHZrr 3371
-VMULPHZrrb 3372
-VMULPHZrrbk 3373
-VMULPHZrrbkz 3374
-VMULPHZrrk 3375
-VMULPHZrrkz 3376
-VMULPSYrm 3377
-VMULPSYrr 3378
-VMULPSZ 3379
-VMULPSZrm 3380
-VMULPSZrmb 3381
-VMULPSZrmbk 3382
-VMULPSZrmbkz 3383
-VMULPSZrmk 3384
-VMULPSZrmkz 3385
-VMULPSZrr 3386
-VMULPSZrrb 3387
-VMULPSZrrbk 3388
-VMULPSZrrbkz 3389
-VMULPSZrrk 3390
-VMULPSZrrkz 3391
-VMULPSrm 3392
-VMULPSrr 3393
-VMULSDZrm 3394
-VMULSDZrm_Int 3395
-VMULSDZrmk_Int 3396
-VMULSDZrmkz_Int 3397
-VMULSDZrr 3398
-VMULSDZrr_Int 3399
-VMULSDZrrb_Int 3400
-VMULSDZrrbk_Int 3401
-VMULSDZrrbkz_Int 3402
-VMULSDZrrk_Int 3403
-VMULSDZrrkz_Int 3404
-VMULSDrm 3405
-VMULSDrm_Int 3406
-VMULSDrr 3407
-VMULSDrr_Int 3408
-VMULSHZrm 3409
-VMULSHZrm_Int 3410
-VMULSHZrmk_Int 3411
-VMULSHZrmkz_Int 3412
-VMULSHZrr 3413
-VMULSHZrr_Int 3414
-VMULSHZrrb_Int 3415
-VMULSHZrrbk_Int 3416
-VMULSHZrrbkz_Int 3417
-VMULSHZrrk_Int 3418
-VMULSHZrrkz_Int 3419
-VMULSSZrm 3420
-VMULSSZrm_Int 3421
-VMULSSZrmk_Int 3422
-VMULSSZrmkz_Int 3423
-VMULSSZrr 3424
-VMULSSZrr_Int 3425
-VMULSSZrrb_Int 3426
-VMULSSZrrbk_Int 3427
-VMULSSZrrbkz_Int 3428
-VMULSSZrrk_Int 3429
-VMULSSZrrkz_Int 3430
-VMULSSrm 3431
-VMULSSrm_Int 3432
-VMULSSrr 3433
-VMULSSrr_Int 3434
-VMWRITE 3435
-VMXOFF 3436
-VMXON 3437
-VORPDYrm 3438
-VORPDYrr 3439
-VORPDZ 3440
-VORPDZrm 3441
-VORPDZrmb 3442
-VORPDZrmbk 3443
-VORPDZrmbkz 3444
-VORPDZrmk 3445
-VORPDZrmkz 3446
-VORPDZrr 3447
-VORPDZrrk 3448
-VORPDZrrkz 3449
-VORPDrm 3450
-VORPDrr 3451
-VORPSYrm 3452
-VORPSYrr 3453
-VORPSZ 3454
-VORPSZrm 3455
-VORPSZrmb 3456
-VORPSZrmbk 3457
-VORPSZrmbkz 3458
-VORPSZrmk 3459
-VORPSZrmkz 3460
-VORPSZrr 3461
-VORPSZrrk 3462
-VORPSZrrkz 3463
-VORPSrm 3464
-VORPSrr 3465
-VP 3466
-VPABSBYrm 3467
-VPABSBYrr 3468
-VPABSBZ 3469
-VPABSBZrm 3470
-VPABSBZrmk 3471
-VPABSBZrmkz 3472
-VPABSBZrr 3473
-VPABSBZrrk 3474
-VPABSBZrrkz 3475
-VPABSBrm 3476
-VPABSBrr 3477
-VPABSDYrm 3478
-VPABSDYrr 3479
-VPABSDZ 3480
-VPABSDZrm 3481
-VPABSDZrmb 3482
-VPABSDZrmbk 3483
-VPABSDZrmbkz 3484
-VPABSDZrmk 3485
-VPABSDZrmkz 3486
-VPABSDZrr 3487
-VPABSDZrrk 3488
-VPABSDZrrkz 3489
-VPABSDrm 3490
-VPABSDrr 3491
-VPABSQZ 3492
-VPABSQZrm 3493
-VPABSQZrmb 3494
-VPABSQZrmbk 3495
-VPABSQZrmbkz 3496
-VPABSQZrmk 3497
-VPABSQZrmkz 3498
-VPABSQZrr 3499
-VPABSQZrrk 3500
-VPABSQZrrkz 3501
-VPABSWYrm 3502
-VPABSWYrr 3503
-VPABSWZ 3504
-VPABSWZrm 3505
-VPABSWZrmk 3506
-VPABSWZrmkz 3507
-VPABSWZrr 3508
-VPABSWZrrk 3509
-VPABSWZrrkz 3510
-VPABSWrm 3511
-VPABSWrr 3512
-VPACKSSDWYrm 3513
-VPACKSSDWYrr 3514
-VPACKSSDWZ 3515
-VPACKSSDWZrm 3516
-VPACKSSDWZrmb 3517
-VPACKSSDWZrmbk 3518
-VPACKSSDWZrmbkz 3519
-VPACKSSDWZrmk 3520
-VPACKSSDWZrmkz 3521
-VPACKSSDWZrr 3522
-VPACKSSDWZrrk 3523
-VPACKSSDWZrrkz 3524
-VPACKSSDWrm 3525
-VPACKSSDWrr 3526
-VPACKSSWBYrm 3527
-VPACKSSWBYrr 3528
-VPACKSSWBZ 3529
-VPACKSSWBZrm 3530
-VPACKSSWBZrmk 3531
-VPACKSSWBZrmkz 3532
-VPACKSSWBZrr 3533
-VPACKSSWBZrrk 3534
-VPACKSSWBZrrkz 3535
-VPACKSSWBrm 3536
-VPACKSSWBrr 3537
-VPACKUSDWYrm 3538
-VPACKUSDWYrr 3539
-VPACKUSDWZ 3540
-VPACKUSDWZrm 3541
-VPACKUSDWZrmb 3542
-VPACKUSDWZrmbk 3543
-VPACKUSDWZrmbkz 3544
-VPACKUSDWZrmk 3545
-VPACKUSDWZrmkz 3546
-VPACKUSDWZrr 3547
-VPACKUSDWZrrk 3548
-VPACKUSDWZrrkz 3549
-VPACKUSDWrm 3550
-VPACKUSDWrr 3551
-VPACKUSWBYrm 3552
-VPACKUSWBYrr 3553
-VPACKUSWBZ 3554
-VPACKUSWBZrm 3555
-VPACKUSWBZrmk 3556
-VPACKUSWBZrmkz 3557
-VPACKUSWBZrr 3558
-VPACKUSWBZrrk 3559
-VPACKUSWBZrrkz 3560
-VPACKUSWBrm 3561
-VPACKUSWBrr 3562
-VPADDBYrm 3563
-VPADDBYrr 3564
-VPADDBZ 3565
-VPADDBZrm 3566
-VPADDBZrmk 3567
-VPADDBZrmkz 3568
-VPADDBZrr 3569
-VPADDBZrrk 3570
-VPADDBZrrkz 3571
-VPADDBrm 3572
-VPADDBrr 3573
-VPADDDYrm 3574
-VPADDDYrr 3575
-VPADDDZ 3576
-VPADDDZrm 3577
-VPADDDZrmb 3578
-VPADDDZrmbk 3579
-VPADDDZrmbkz 3580
-VPADDDZrmk 3581
-VPADDDZrmkz 3582
-VPADDDZrr 3583
-VPADDDZrrk 3584
-VPADDDZrrkz 3585
-VPADDDrm 3586
-VPADDDrr 3587
-VPADDQYrm 3588
-VPADDQYrr 3589
-VPADDQZ 3590
-VPADDQZrm 3591
-VPADDQZrmb 3592
-VPADDQZrmbk 3593
-VPADDQZrmbkz 3594
-VPADDQZrmk 3595
-VPADDQZrmkz 3596
-VPADDQZrr 3597
-VPADDQZrrk 3598
-VPADDQZrrkz 3599
-VPADDQrm 3600
-VPADDQrr 3601
-VPADDSBYrm 3602
-VPADDSBYrr 3603
-VPADDSBZ 3604
-VPADDSBZrm 3605
-VPADDSBZrmk 3606
-VPADDSBZrmkz 3607
-VPADDSBZrr 3608
-VPADDSBZrrk 3609
-VPADDSBZrrkz 3610
-VPADDSBrm 3611
-VPADDSBrr 3612
-VPADDSWYrm 3613
-VPADDSWYrr 3614
-VPADDSWZ 3615
-VPADDSWZrm 3616
-VPADDSWZrmk 3617
-VPADDSWZrmkz 3618
-VPADDSWZrr 3619
-VPADDSWZrrk 3620
-VPADDSWZrrkz 3621
-VPADDSWrm 3622
-VPADDSWrr 3623
-VPADDUSBYrm 3624
-VPADDUSBYrr 3625
-VPADDUSBZ 3626
-VPADDUSBZrm 3627
-VPADDUSBZrmk 3628
-VPADDUSBZrmkz 3629
-VPADDUSBZrr 3630
-VPADDUSBZrrk 3631
-VPADDUSBZrrkz 3632
-VPADDUSBrm 3633
-VPADDUSBrr 3634
-VPADDUSWYrm 3635
-VPADDUSWYrr 3636
-VPADDUSWZ 3637
-VPADDUSWZrm 3638
-VPADDUSWZrmk 3639
-VPADDUSWZrmkz 3640
-VPADDUSWZrr 3641
-VPADDUSWZrrk 3642
-VPADDUSWZrrkz 3643
-VPADDUSWrm 3644
-VPADDUSWrr 3645
-VPADDWYrm 3646
-VPADDWYrr 3647
-VPADDWZ 3648
-VPADDWZrm 3649
-VPADDWZrmk 3650
-VPADDWZrmkz 3651
-VPADDWZrr 3652
-VPADDWZrrk 3653
-VPADDWZrrkz 3654
-VPADDWrm 3655
-VPADDWrr 3656
-VPALIGNRYrmi 3657
-VPALIGNRYrri 3658
-VPALIGNRZ 3659
-VPALIGNRZrmi 3660
-VPALIGNRZrmik 3661
-VPALIGNRZrmikz 3662
-VPALIGNRZrri 3663
-VPALIGNRZrrik 3664
-VPALIGNRZrrikz 3665
-VPALIGNRrmi 3666
-VPALIGNRrri 3667
-VPANDDZ 3668
-VPANDDZrm 3669
-VPANDDZrmb 3670
-VPANDDZrmbk 3671
-VPANDDZrmbkz 3672
-VPANDDZrmk 3673
-VPANDDZrmkz 3674
-VPANDDZrr 3675
-VPANDDZrrk 3676
-VPANDDZrrkz 3677
-VPANDNDZ 3678
-VPANDNDZrm 3679
-VPANDNDZrmb 3680
-VPANDNDZrmbk 3681
-VPANDNDZrmbkz 3682
-VPANDNDZrmk 3683
-VPANDNDZrmkz 3684
-VPANDNDZrr 3685
-VPANDNDZrrk 3686
-VPANDNDZrrkz 3687
-VPANDNQZ 3688
-VPANDNQZrm 3689
-VPANDNQZrmb 3690
-VPANDNQZrmbk 3691
-VPANDNQZrmbkz 3692
-VPANDNQZrmk 3693
-VPANDNQZrmkz 3694
-VPANDNQZrr 3695
-VPANDNQZrrk 3696
-VPANDNQZrrkz 3697
-VPANDNYrm 3698
-VPANDNYrr 3699
-VPANDNrm 3700
-VPANDNrr 3701
-VPANDQZ 3702
-VPANDQZrm 3703
-VPANDQZrmb 3704
-VPANDQZrmbk 3705
-VPANDQZrmbkz 3706
-VPANDQZrmk 3707
-VPANDQZrmkz 3708
-VPANDQZrr 3709
-VPANDQZrrk 3710
-VPANDQZrrkz 3711
-VPANDYrm 3712
-VPANDYrr 3713
-VPANDrm 3714
-VPANDrr 3715
-VPAVGBYrm 3716
-VPAVGBYrr 3717
-VPAVGBZ 3718
-VPAVGBZrm 3719
-VPAVGBZrmk 3720
-VPAVGBZrmkz 3721
-VPAVGBZrr 3722
-VPAVGBZrrk 3723
-VPAVGBZrrkz 3724
-VPAVGBrm 3725
-VPAVGBrr 3726
-VPAVGWYrm 3727
-VPAVGWYrr 3728
-VPAVGWZ 3729
-VPAVGWZrm 3730
-VPAVGWZrmk 3731
-VPAVGWZrmkz 3732
-VPAVGWZrr 3733
-VPAVGWZrrk 3734
-VPAVGWZrrkz 3735
-VPAVGWrm 3736
-VPAVGWrr 3737
-VPBLENDDYrmi 3738
-VPBLENDDYrri 3739
-VPBLENDDrmi 3740
-VPBLENDDrri 3741
-VPBLENDMBZ 3742
-VPBLENDMBZrm 3743
-VPBLENDMBZrmk 3744
-VPBLENDMBZrmkz 3745
-VPBLENDMBZrr 3746
-VPBLENDMBZrrk 3747
-VPBLENDMBZrrkz 3748
-VPBLENDMDZ 3749
-VPBLENDMDZrm 3750
-VPBLENDMDZrmb 3751
-VPBLENDMDZrmbk 3752
-VPBLENDMDZrmbkz 3753
-VPBLENDMDZrmk 3754
-VPBLENDMDZrmkz 3755
-VPBLENDMDZrr 3756
-VPBLENDMDZrrk 3757
-VPBLENDMDZrrkz 3758
-VPBLENDMQZ 3759
-VPBLENDMQZrm 3760
-VPBLENDMQZrmb 3761
-VPBLENDMQZrmbk 3762
-VPBLENDMQZrmbkz 3763
-VPBLENDMQZrmk 3764
-VPBLENDMQZrmkz 3765
-VPBLENDMQZrr 3766
-VPBLENDMQZrrk 3767
-VPBLENDMQZrrkz 3768
-VPBLENDMWZ 3769
-VPBLENDMWZrm 3770
-VPBLENDMWZrmk 3771
-VPBLENDMWZrmkz 3772
-VPBLENDMWZrr 3773
-VPBLENDMWZrrk 3774
-VPBLENDMWZrrkz 3775
-VPBLENDVBYrmr 3776
-VPBLENDVBYrrr 3777
-VPBLENDVBrmr 3778
-VPBLENDVBrrr 3779
-VPBLENDWYrmi 3780
-VPBLENDWYrri 3781
-VPBLENDWrmi 3782
-VPBLENDWrri 3783
-VPBROADCASTBYrm 3784
-VPBROADCASTBYrr 3785
-VPBROADCASTBZ 3786
-VPBROADCASTBZrm 3787
-VPBROADCASTBZrmk 3788
-VPBROADCASTBZrmkz 3789
-VPBROADCASTBZrr 3790
-VPBROADCASTBZrrk 3791
-VPBROADCASTBZrrkz 3792
-VPBROADCASTBrZ 3793
-VPBROADCASTBrZrr 3794
-VPBROADCASTBrZrrk 3795
-VPBROADCASTBrZrrkz 3796
-VPBROADCASTBrm 3797
-VPBROADCASTBrr 3798
-VPBROADCASTDYrm 3799
-VPBROADCASTDYrr 3800
-VPBROADCASTDZ 3801
-VPBROADCASTDZrm 3802
-VPBROADCASTDZrmk 3803
-VPBROADCASTDZrmkz 3804
-VPBROADCASTDZrr 3805
-VPBROADCASTDZrrk 3806
-VPBROADCASTDZrrkz 3807
-VPBROADCASTDrZ 3808
-VPBROADCASTDrZrr 3809
-VPBROADCASTDrZrrk 3810
-VPBROADCASTDrZrrkz 3811
-VPBROADCASTDrm 3812
-VPBROADCASTDrr 3813
-VPBROADCASTMB 3814
-VPBROADCASTMW 3815
-VPBROADCASTQYrm 3816
-VPBROADCASTQYrr 3817
-VPBROADCASTQZ 3818
-VPBROADCASTQZrm 3819
-VPBROADCASTQZrmk 3820
-VPBROADCASTQZrmkz 3821
-VPBROADCASTQZrr 3822
-VPBROADCASTQZrrk 3823
-VPBROADCASTQZrrkz 3824
-VPBROADCASTQrZ 3825
-VPBROADCASTQrZrr 3826
-VPBROADCASTQrZrrk 3827
-VPBROADCASTQrZrrkz 3828
-VPBROADCASTQrm 3829
-VPBROADCASTQrr 3830
-VPBROADCASTWYrm 3831
-VPBROADCASTWYrr 3832
-VPBROADCASTWZ 3833
-VPBROADCASTWZrm 3834
-VPBROADCASTWZrmk 3835
-VPBROADCASTWZrmkz 3836
-VPBROADCASTWZrr 3837
-VPBROADCASTWZrrk 3838
-VPBROADCASTWZrrkz 3839
-VPBROADCASTWrZ 3840
-VPBROADCASTWrZrr 3841
-VPBROADCASTWrZrrk 3842
-VPBROADCASTWrZrrkz 3843
-VPBROADCASTWrm 3844
-VPBROADCASTWrr 3845
-VPCLMULQDQYrmi 3846
-VPCLMULQDQYrri 3847
-VPCLMULQDQZ 3848
-VPCLMULQDQZrmi 3849
-VPCLMULQDQZrri 3850
-VPCLMULQDQrmi 3851
-VPCLMULQDQrri 3852
-VPCMOVYrmr 3853
-VPCMOVYrrm 3854
-VPCMOVYrrr 3855
-VPCMOVYrrr_REV 3856
-VPCMOVrmr 3857
-VPCMOVrrm 3858
-VPCMOVrrr 3859
-VPCMOVrrr_REV 3860
-VPCMPBZ 3861
-VPCMPBZrmi 3862
-VPCMPBZrmik 3863
-VPCMPBZrri 3864
-VPCMPBZrrik 3865
-VPCMPDZ 3866
-VPCMPDZrmbi 3867
-VPCMPDZrmbik 3868
-VPCMPDZrmi 3869
-VPCMPDZrmik 3870
-VPCMPDZrri 3871
-VPCMPDZrrik 3872
-VPCMPEQBYrm 3873
-VPCMPEQBYrr 3874
-VPCMPEQBZ 3875
-VPCMPEQBZrm 3876
-VPCMPEQBZrmk 3877
-VPCMPEQBZrr 3878
-VPCMPEQBZrrk 3879
-VPCMPEQBrm 3880
-VPCMPEQBrr 3881
-VPCMPEQDYrm 3882
-VPCMPEQDYrr 3883
-VPCMPEQDZ 3884
-VPCMPEQDZrm 3885
-VPCMPEQDZrmb 3886
-VPCMPEQDZrmbk 3887
-VPCMPEQDZrmk 3888
-VPCMPEQDZrr 3889
-VPCMPEQDZrrk 3890
-VPCMPEQDrm 3891
-VPCMPEQDrr 3892
-VPCMPEQQYrm 3893
-VPCMPEQQYrr 3894
-VPCMPEQQZ 3895
-VPCMPEQQZrm 3896
-VPCMPEQQZrmb 3897
-VPCMPEQQZrmbk 3898
-VPCMPEQQZrmk 3899
-VPCMPEQQZrr 3900
-VPCMPEQQZrrk 3901
-VPCMPEQQrm 3902
-VPCMPEQQrr 3903
-VPCMPEQWYrm 3904
-VPCMPEQWYrr 3905
-VPCMPEQWZ 3906
-VPCMPEQWZrm 3907
-VPCMPEQWZrmk 3908
-VPCMPEQWZrr 3909
-VPCMPEQWZrrk 3910
-VPCMPEQWrm 3911
-VPCMPEQWrr 3912
-VPCMPESTRIrmi 3913
-VPCMPESTRIrri 3914
-VPCMPESTRMrmi 3915
-VPCMPESTRMrri 3916
-VPCMPGTBYrm 3917
-VPCMPGTBYrr 3918
-VPCMPGTBZ 3919
-VPCMPGTBZrm 3920
-VPCMPGTBZrmk 3921
-VPCMPGTBZrr 3922
-VPCMPGTBZrrk 3923
-VPCMPGTBrm 3924
-VPCMPGTBrr 3925
-VPCMPGTDYrm 3926
-VPCMPGTDYrr 3927
-VPCMPGTDZ 3928
-VPCMPGTDZrm 3929
-VPCMPGTDZrmb 3930
-VPCMPGTDZrmbk 3931
-VPCMPGTDZrmk 3932
-VPCMPGTDZrr 3933
-VPCMPGTDZrrk 3934
-VPCMPGTDrm 3935
-VPCMPGTDrr 3936
-VPCMPGTQYrm 3937
-VPCMPGTQYrr 3938
-VPCMPGTQZ 3939
-VPCMPGTQZrm 3940
-VPCMPGTQZrmb 3941
-VPCMPGTQZrmbk 3942
-VPCMPGTQZrmk 3943
-VPCMPGTQZrr 3944
-VPCMPGTQZrrk 3945
-VPCMPGTQrm 3946
-VPCMPGTQrr 3947
-VPCMPGTWYrm 3948
-VPCMPGTWYrr 3949
-VPCMPGTWZ 3950
-VPCMPGTWZrm 3951
-VPCMPGTWZrmk 3952
-VPCMPGTWZrr 3953
-VPCMPGTWZrrk 3954
-VPCMPGTWrm 3955
-VPCMPGTWrr 3956
-VPCMPISTRIrmi 3957
-VPCMPISTRIrri 3958
-VPCMPISTRMrmi 3959
-VPCMPISTRMrri 3960
-VPCMPQZ 3961
-VPCMPQZrmbi 3962
-VPCMPQZrmbik 3963
-VPCMPQZrmi 3964
-VPCMPQZrmik 3965
-VPCMPQZrri 3966
-VPCMPQZrrik 3967
-VPCMPUBZ 3968
-VPCMPUBZrmi 3969
-VPCMPUBZrmik 3970
-VPCMPUBZrri 3971
-VPCMPUBZrrik 3972
-VPCMPUDZ 3973
-VPCMPUDZrmbi 3974
-VPCMPUDZrmbik 3975
-VPCMPUDZrmi 3976
-VPCMPUDZrmik 3977
-VPCMPUDZrri 3978
-VPCMPUDZrrik 3979
-VPCMPUQZ 3980
-VPCMPUQZrmbi 3981
-VPCMPUQZrmbik 3982
-VPCMPUQZrmi 3983
-VPCMPUQZrmik 3984
-VPCMPUQZrri 3985
-VPCMPUQZrrik 3986
-VPCMPUWZ 3987
-VPCMPUWZrmi 3988
-VPCMPUWZrmik 3989
-VPCMPUWZrri 3990
-VPCMPUWZrrik 3991
-VPCMPWZ 3992
-VPCMPWZrmi 3993
-VPCMPWZrmik 3994
-VPCMPWZrri 3995
-VPCMPWZrrik 3996
-VPCOMBmi 3997
-VPCOMBri 3998
-VPCOMDmi 3999
-VPCOMDri 4000
-VPCOMPRESSBZ 4001
-VPCOMPRESSBZmr 4002
-VPCOMPRESSBZmrk 4003
-VPCOMPRESSBZrr 4004
-VPCOMPRESSBZrrk 4005
-VPCOMPRESSBZrrkz 4006
-VPCOMPRESSDZ 4007
-VPCOMPRESSDZmr 4008
-VPCOMPRESSDZmrk 4009
-VPCOMPRESSDZrr 4010
-VPCOMPRESSDZrrk 4011
-VPCOMPRESSDZrrkz 4012
-VPCOMPRESSQZ 4013
-VPCOMPRESSQZmr 4014
-VPCOMPRESSQZmrk 4015
-VPCOMPRESSQZrr 4016
-VPCOMPRESSQZrrk 4017
-VPCOMPRESSQZrrkz 4018
-VPCOMPRESSWZ 4019
-VPCOMPRESSWZmr 4020
-VPCOMPRESSWZmrk 4021
-VPCOMPRESSWZrr 4022
-VPCOMPRESSWZrrk 4023
-VPCOMPRESSWZrrkz 4024
-VPCOMQmi 4025
-VPCOMQri 4026
-VPCOMUBmi 4027
-VPCOMUBri 4028
-VPCOMUDmi 4029
-VPCOMUDri 4030
-VPCOMUQmi 4031
-VPCOMUQri 4032
-VPCOMUWmi 4033
-VPCOMUWri 4034
-VPCOMWmi 4035
-VPCOMWri 4036
-VPCONFLICTDZ 4037
-VPCONFLICTDZrm 4038
-VPCONFLICTDZrmb 4039
-VPCONFLICTDZrmbk 4040
-VPCONFLICTDZrmbkz 4041
-VPCONFLICTDZrmk 4042
-VPCONFLICTDZrmkz 4043
-VPCONFLICTDZrr 4044
-VPCONFLICTDZrrk 4045
-VPCONFLICTDZrrkz 4046
-VPCONFLICTQZ 4047
-VPCONFLICTQZrm 4048
-VPCONFLICTQZrmb 4049
-VPCONFLICTQZrmbk 4050
-VPCONFLICTQZrmbkz 4051
-VPCONFLICTQZrmk 4052
-VPCONFLICTQZrmkz 4053
-VPCONFLICTQZrr 4054
-VPCONFLICTQZrrk 4055
-VPCONFLICTQZrrkz 4056
-VPDPBSSDSYrm 4057
-VPDPBSSDSYrr 4058
-VPDPBSSDSZ 4059
-VPDPBSSDSZrm 4060
-VPDPBSSDSZrmb 4061
-VPDPBSSDSZrmbk 4062
-VPDPBSSDSZrmbkz 4063
-VPDPBSSDSZrmk 4064
-VPDPBSSDSZrmkz 4065
-VPDPBSSDSZrr 4066
-VPDPBSSDSZrrk 4067
-VPDPBSSDSZrrkz 4068
-VPDPBSSDSrm 4069
-VPDPBSSDSrr 4070
-VPDPBSSDYrm 4071
-VPDPBSSDYrr 4072
-VPDPBSSDZ 4073
-VPDPBSSDZrm 4074
-VPDPBSSDZrmb 4075
-VPDPBSSDZrmbk 4076
-VPDPBSSDZrmbkz 4077
-VPDPBSSDZrmk 4078
-VPDPBSSDZrmkz 4079
-VPDPBSSDZrr 4080
-VPDPBSSDZrrk 4081
-VPDPBSSDZrrkz 4082
-VPDPBSSDrm 4083
-VPDPBSSDrr 4084
-VPDPBSUDSYrm 4085
-VPDPBSUDSYrr 4086
-VPDPBSUDSZ 4087
-VPDPBSUDSZrm 4088
-VPDPBSUDSZrmb 4089
-VPDPBSUDSZrmbk 4090
-VPDPBSUDSZrmbkz 4091
-VPDPBSUDSZrmk 4092
-VPDPBSUDSZrmkz 4093
-VPDPBSUDSZrr 4094
-VPDPBSUDSZrrk 4095
-VPDPBSUDSZrrkz 4096
-VPDPBSUDSrm 4097
-VPDPBSUDSrr 4098
-VPDPBSUDYrm 4099
-VPDPBSUDYrr 4100
-VPDPBSUDZ 4101
-VPDPBSUDZrm 4102
-VPDPBSUDZrmb 4103
-VPDPBSUDZrmbk 4104
-VPDPBSUDZrmbkz 4105
-VPDPBSUDZrmk 4106
-VPDPBSUDZrmkz 4107
-VPDPBSUDZrr 4108
-VPDPBSUDZrrk 4109
-VPDPBSUDZrrkz 4110
-VPDPBSUDrm 4111
-VPDPBSUDrr 4112
-VPDPBUSDSYrm 4113
-VPDPBUSDSYrr 4114
-VPDPBUSDSZ 4115
-VPDPBUSDSZrm 4116
-VPDPBUSDSZrmb 4117
-VPDPBUSDSZrmbk 4118
-VPDPBUSDSZrmbkz 4119
-VPDPBUSDSZrmk 4120
-VPDPBUSDSZrmkz 4121
-VPDPBUSDSZrr 4122
-VPDPBUSDSZrrk 4123
-VPDPBUSDSZrrkz 4124
-VPDPBUSDSrm 4125
-VPDPBUSDSrr 4126
-VPDPBUSDYrm 4127
-VPDPBUSDYrr 4128
-VPDPBUSDZ 4129
-VPDPBUSDZrm 4130
-VPDPBUSDZrmb 4131
-VPDPBUSDZrmbk 4132
-VPDPBUSDZrmbkz 4133
-VPDPBUSDZrmk 4134
-VPDPBUSDZrmkz 4135
-VPDPBUSDZrr 4136
-VPDPBUSDZrrk 4137
-VPDPBUSDZrrkz 4138
-VPDPBUSDrm 4139
-VPDPBUSDrr 4140
-VPDPBUUDSYrm 4141
-VPDPBUUDSYrr 4142
-VPDPBUUDSZ 4143
-VPDPBUUDSZrm 4144
-VPDPBUUDSZrmb 4145
-VPDPBUUDSZrmbk 4146
-VPDPBUUDSZrmbkz 4147
-VPDPBUUDSZrmk 4148
-VPDPBUUDSZrmkz 4149
-VPDPBUUDSZrr 4150
-VPDPBUUDSZrrk 4151
-VPDPBUUDSZrrkz 4152
-VPDPBUUDSrm 4153
-VPDPBUUDSrr 4154
-VPDPBUUDYrm 4155
-VPDPBUUDYrr 4156
-VPDPBUUDZ 4157
-VPDPBUUDZrm 4158
-VPDPBUUDZrmb 4159
-VPDPBUUDZrmbk 4160
-VPDPBUUDZrmbkz 4161
-VPDPBUUDZrmk 4162
-VPDPBUUDZrmkz 4163
-VPDPBUUDZrr 4164
-VPDPBUUDZrrk 4165
-VPDPBUUDZrrkz 4166
-VPDPBUUDrm 4167
-VPDPBUUDrr 4168
-VPDPWSSDSYrm 4169
-VPDPWSSDSYrr 4170
-VPDPWSSDSZ 4171
-VPDPWSSDSZrm 4172
-VPDPWSSDSZrmb 4173
-VPDPWSSDSZrmbk 4174
-VPDPWSSDSZrmbkz 4175
-VPDPWSSDSZrmk 4176
-VPDPWSSDSZrmkz 4177
-VPDPWSSDSZrr 4178
-VPDPWSSDSZrrk 4179
-VPDPWSSDSZrrkz 4180
-VPDPWSSDSrm 4181
-VPDPWSSDSrr 4182
-VPDPWSSDYrm 4183
-VPDPWSSDYrr 4184
-VPDPWSSDZ 4185
-VPDPWSSDZrm 4186
-VPDPWSSDZrmb 4187
-VPDPWSSDZrmbk 4188
-VPDPWSSDZrmbkz 4189
-VPDPWSSDZrmk 4190
-VPDPWSSDZrmkz 4191
-VPDPWSSDZrr 4192
-VPDPWSSDZrrk 4193
-VPDPWSSDZrrkz 4194
-VPDPWSSDrm 4195
-VPDPWSSDrr 4196
-VPDPWSUDSYrm 4197
-VPDPWSUDSYrr 4198
-VPDPWSUDSZ 4199
-VPDPWSUDSZrm 4200
-VPDPWSUDSZrmb 4201
-VPDPWSUDSZrmbk 4202
-VPDPWSUDSZrmbkz 4203
-VPDPWSUDSZrmk 4204
-VPDPWSUDSZrmkz 4205
-VPDPWSUDSZrr 4206
-VPDPWSUDSZrrk 4207
-VPDPWSUDSZrrkz 4208
-VPDPWSUDSrm 4209
-VPDPWSUDSrr 4210
-VPDPWSUDYrm 4211
-VPDPWSUDYrr 4212
-VPDPWSUDZ 4213
-VPDPWSUDZrm 4214
-VPDPWSUDZrmb 4215
-VPDPWSUDZrmbk 4216
-VPDPWSUDZrmbkz 4217
-VPDPWSUDZrmk 4218
-VPDPWSUDZrmkz 4219
-VPDPWSUDZrr 4220
-VPDPWSUDZrrk 4221
-VPDPWSUDZrrkz 4222
-VPDPWSUDrm 4223
-VPDPWSUDrr 4224
-VPDPWUSDSYrm 4225
-VPDPWUSDSYrr 4226
-VPDPWUSDSZ 4227
-VPDPWUSDSZrm 4228
-VPDPWUSDSZrmb 4229
-VPDPWUSDSZrmbk 4230
-VPDPWUSDSZrmbkz 4231
-VPDPWUSDSZrmk 4232
-VPDPWUSDSZrmkz 4233
-VPDPWUSDSZrr 4234
-VPDPWUSDSZrrk 4235
-VPDPWUSDSZrrkz 4236
-VPDPWUSDSrm 4237
-VPDPWUSDSrr 4238
-VPDPWUSDYrm 4239
-VPDPWUSDYrr 4240
-VPDPWUSDZ 4241
-VPDPWUSDZrm 4242
-VPDPWUSDZrmb 4243
-VPDPWUSDZrmbk 4244
-VPDPWUSDZrmbkz 4245
-VPDPWUSDZrmk 4246
-VPDPWUSDZrmkz 4247
-VPDPWUSDZrr 4248
-VPDPWUSDZrrk 4249
-VPDPWUSDZrrkz 4250
-VPDPWUSDrm 4251
-VPDPWUSDrr 4252
-VPDPWUUDSYrm 4253
-VPDPWUUDSYrr 4254
-VPDPWUUDSZ 4255
-VPDPWUUDSZrm 4256
-VPDPWUUDSZrmb 4257
-VPDPWUUDSZrmbk 4258
-VPDPWUUDSZrmbkz 4259
-VPDPWUUDSZrmk 4260
-VPDPWUUDSZrmkz 4261
-VPDPWUUDSZrr 4262
-VPDPWUUDSZrrk 4263
-VPDPWUUDSZrrkz 4264
-VPDPWUUDSrm 4265
-VPDPWUUDSrr 4266
-VPDPWUUDYrm 4267
-VPDPWUUDYrr 4268
-VPDPWUUDZ 4269
-VPDPWUUDZrm 4270
-VPDPWUUDZrmb 4271
-VPDPWUUDZrmbk 4272
-VPDPWUUDZrmbkz 4273
-VPDPWUUDZrmk 4274
-VPDPWUUDZrmkz 4275
-VPDPWUUDZrr 4276
-VPDPWUUDZrrk 4277
-VPDPWUUDZrrkz 4278
-VPDPWUUDrm 4279
-VPDPWUUDrr 4280
-VPERM 4281
-VPERMBZ 4282
-VPERMBZrm 4283
-VPERMBZrmk 4284
-VPERMBZrmkz 4285
-VPERMBZrr 4286
-VPERMBZrrk 4287
-VPERMBZrrkz 4288
-VPERMDYrm 4289
-VPERMDYrr 4290
-VPERMDZ 4291
-VPERMDZrm 4292
-VPERMDZrmb 4293
-VPERMDZrmbk 4294
-VPERMDZrmbkz 4295
-VPERMDZrmk 4296
-VPERMDZrmkz 4297
-VPERMDZrr 4298
-VPERMDZrrk 4299
-VPERMDZrrkz 4300
-VPERMI 4301
-VPERMIL 4302
-VPERMILPDYmi 4303
-VPERMILPDYri 4304
-VPERMILPDYrm 4305
-VPERMILPDYrr 4306
-VPERMILPDZ 4307
-VPERMILPDZmbi 4308
-VPERMILPDZmbik 4309
-VPERMILPDZmbikz 4310
-VPERMILPDZmi 4311
-VPERMILPDZmik 4312
-VPERMILPDZmikz 4313
-VPERMILPDZri 4314
-VPERMILPDZrik 4315
-VPERMILPDZrikz 4316
-VPERMILPDZrm 4317
-VPERMILPDZrmb 4318
-VPERMILPDZrmbk 4319
-VPERMILPDZrmbkz 4320
-VPERMILPDZrmk 4321
-VPERMILPDZrmkz 4322
-VPERMILPDZrr 4323
-VPERMILPDZrrk 4324
-VPERMILPDZrrkz 4325
-VPERMILPDmi 4326
-VPERMILPDri 4327
-VPERMILPDrm 4328
-VPERMILPDrr 4329
-VPERMILPSYmi 4330
-VPERMILPSYri 4331
-VPERMILPSYrm 4332
-VPERMILPSYrr 4333
-VPERMILPSZ 4334
-VPERMILPSZmbi 4335
-VPERMILPSZmbik 4336
-VPERMILPSZmbikz 4337
-VPERMILPSZmi 4338
-VPERMILPSZmik 4339
-VPERMILPSZmikz 4340
-VPERMILPSZri 4341
-VPERMILPSZrik 4342
-VPERMILPSZrikz 4343
-VPERMILPSZrm 4344
-VPERMILPSZrmb 4345
-VPERMILPSZrmbk 4346
-VPERMILPSZrmbkz 4347
-VPERMILPSZrmk 4348
-VPERMILPSZrmkz 4349
-VPERMILPSZrr 4350
-VPERMILPSZrrk 4351
-VPERMILPSZrrkz 4352
-VPERMILPSmi 4353
-VPERMILPSri 4354
-VPERMILPSrm 4355
-VPERMILPSrr 4356
-VPERMPDYmi 4357
-VPERMPDYri 4358
-VPERMPDZ 4359
-VPERMPDZmbi 4360
-VPERMPDZmbik 4361
-VPERMPDZmbikz 4362
-VPERMPDZmi 4363
-VPERMPDZmik 4364
-VPERMPDZmikz 4365
-VPERMPDZri 4366
-VPERMPDZrik 4367
-VPERMPDZrikz 4368
-VPERMPDZrm 4369
-VPERMPDZrmb 4370
-VPERMPDZrmbk 4371
-VPERMPDZrmbkz 4372
-VPERMPDZrmk 4373
-VPERMPDZrmkz 4374
-VPERMPDZrr 4375
-VPERMPDZrrk 4376
-VPERMPDZrrkz 4377
-VPERMPSYrm 4378
-VPERMPSYrr 4379
-VPERMPSZ 4380
-VPERMPSZrm 4381
-VPERMPSZrmb 4382
-VPERMPSZrmbk 4383
-VPERMPSZrmbkz 4384
-VPERMPSZrmk 4385
-VPERMPSZrmkz 4386
-VPERMPSZrr 4387
-VPERMPSZrrk 4388
-VPERMPSZrrkz 4389
-VPERMQYmi 4390
-VPERMQYri 4391
-VPERMQZ 4392
-VPERMQZmbi 4393
-VPERMQZmbik 4394
-VPERMQZmbikz 4395
-VPERMQZmi 4396
-VPERMQZmik 4397
-VPERMQZmikz 4398
-VPERMQZri 4399
-VPERMQZrik 4400
-VPERMQZrikz 4401
-VPERMQZrm 4402
-VPERMQZrmb 4403
-VPERMQZrmbk 4404
-VPERMQZrmbkz 4405
-VPERMQZrmk 4406
-VPERMQZrmkz 4407
-VPERMQZrr 4408
-VPERMQZrrk 4409
-VPERMQZrrkz 4410
-VPERMT 4411
-VPERMWZ 4412
-VPERMWZrm 4413
-VPERMWZrmk 4414
-VPERMWZrmkz 4415
-VPERMWZrr 4416
-VPERMWZrrk 4417
-VPERMWZrrkz 4418
-VPEXPANDBZ 4419
-VPEXPANDBZrm 4420
-VPEXPANDBZrmk 4421
-VPEXPANDBZrmkz 4422
-VPEXPANDBZrr 4423
-VPEXPANDBZrrk 4424
-VPEXPANDBZrrkz 4425
-VPEXPANDDZ 4426
-VPEXPANDDZrm 4427
-VPEXPANDDZrmk 4428
-VPEXPANDDZrmkz 4429
-VPEXPANDDZrr 4430
-VPEXPANDDZrrk 4431
-VPEXPANDDZrrkz 4432
-VPEXPANDQZ 4433
-VPEXPANDQZrm 4434
-VPEXPANDQZrmk 4435
-VPEXPANDQZrmkz 4436
-VPEXPANDQZrr 4437
-VPEXPANDQZrrk 4438
-VPEXPANDQZrrkz 4439
-VPEXPANDWZ 4440
-VPEXPANDWZrm 4441
-VPEXPANDWZrmk 4442
-VPEXPANDWZrmkz 4443
-VPEXPANDWZrr 4444
-VPEXPANDWZrrk 4445
-VPEXPANDWZrrkz 4446
-VPEXTRBZmri 4447
-VPEXTRBZrri 4448
-VPEXTRBmri 4449
-VPEXTRBrri 4450
-VPEXTRDZmri 4451
-VPEXTRDZrri 4452
-VPEXTRDmri 4453
-VPEXTRDrri 4454
-VPEXTRQZmri 4455
-VPEXTRQZrri 4456
-VPEXTRQmri 4457
-VPEXTRQrri 4458
-VPEXTRWZmri 4459
-VPEXTRWZrri 4460
-VPEXTRWZrri_REV 4461
-VPEXTRWmri 4462
-VPEXTRWrri 4463
-VPEXTRWrri_REV 4464
-VPGATHERDDYrm 4465
-VPGATHERDDZ 4466
-VPGATHERDDZrm 4467
-VPGATHERDDrm 4468
-VPGATHERDQYrm 4469
-VPGATHERDQZ 4470
-VPGATHERDQZrm 4471
-VPGATHERDQrm 4472
-VPGATHERQDYrm 4473
-VPGATHERQDZ 4474
-VPGATHERQDZrm 4475
-VPGATHERQDrm 4476
-VPGATHERQQYrm 4477
-VPGATHERQQZ 4478
-VPGATHERQQZrm 4479
-VPGATHERQQrm 4480
-VPHADDBDrm 4481
-VPHADDBDrr 4482
-VPHADDBQrm 4483
-VPHADDBQrr 4484
-VPHADDBWrm 4485
-VPHADDBWrr 4486
-VPHADDDQrm 4487
-VPHADDDQrr 4488
-VPHADDDYrm 4489
-VPHADDDYrr 4490
-VPHADDDrm 4491
-VPHADDDrr 4492
-VPHADDSWYrm 4493
-VPHADDSWYrr 4494
-VPHADDSWrm 4495
-VPHADDSWrr 4496
-VPHADDUBDrm 4497
-VPHADDUBDrr 4498
-VPHADDUBQrm 4499
-VPHADDUBQrr 4500
-VPHADDUBWrm 4501
-VPHADDUBWrr 4502
-VPHADDUDQrm 4503
-VPHADDUDQrr 4504
-VPHADDUWDrm 4505
-VPHADDUWDrr 4506
-VPHADDUWQrm 4507
-VPHADDUWQrr 4508
-VPHADDWDrm 4509
-VPHADDWDrr 4510
-VPHADDWQrm 4511
-VPHADDWQrr 4512
-VPHADDWYrm 4513
-VPHADDWYrr 4514
-VPHADDWrm 4515
-VPHADDWrr 4516
-VPHMINPOSUWrm 4517
-VPHMINPOSUWrr 4518
-VPHSUBBWrm 4519
-VPHSUBBWrr 4520
-VPHSUBDQrm 4521
-VPHSUBDQrr 4522
-VPHSUBDYrm 4523
-VPHSUBDYrr 4524
-VPHSUBDrm 4525
-VPHSUBDrr 4526
-VPHSUBSWYrm 4527
-VPHSUBSWYrr 4528
-VPHSUBSWrm 4529
-VPHSUBSWrr 4530
-VPHSUBWDrm 4531
-VPHSUBWDrr 4532
-VPHSUBWYrm 4533
-VPHSUBWYrr 4534
-VPHSUBWrm 4535
-VPHSUBWrr 4536
-VPINSRBZrmi 4537
-VPINSRBZrri 4538
-VPINSRBrmi 4539
-VPINSRBrri 4540
-VPINSRDZrmi 4541
-VPINSRDZrri 4542
-VPINSRDrmi 4543
-VPINSRDrri 4544
-VPINSRQZrmi 4545
-VPINSRQZrri 4546
-VPINSRQrmi 4547
-VPINSRQrri 4548
-VPINSRWZrmi 4549
-VPINSRWZrri 4550
-VPINSRWrmi 4551
-VPINSRWrri 4552
-VPLZCNTDZ 4553
-VPLZCNTDZrm 4554
-VPLZCNTDZrmb 4555
-VPLZCNTDZrmbk 4556
-VPLZCNTDZrmbkz 4557
-VPLZCNTDZrmk 4558
-VPLZCNTDZrmkz 4559
-VPLZCNTDZrr 4560
-VPLZCNTDZrrk 4561
-VPLZCNTDZrrkz 4562
-VPLZCNTQZ 4563
-VPLZCNTQZrm 4564
-VPLZCNTQZrmb 4565
-VPLZCNTQZrmbk 4566
-VPLZCNTQZrmbkz 4567
-VPLZCNTQZrmk 4568
-VPLZCNTQZrmkz 4569
-VPLZCNTQZrr 4570
-VPLZCNTQZrrk 4571
-VPLZCNTQZrrkz 4572
-VPMACSDDrm 4573
-VPMACSDDrr 4574
-VPMACSDQHrm 4575
-VPMACSDQHrr 4576
-VPMACSDQLrm 4577
-VPMACSDQLrr 4578
-VPMACSSDDrm 4579
-VPMACSSDDrr 4580
-VPMACSSDQHrm 4581
-VPMACSSDQHrr 4582
-VPMACSSDQLrm 4583
-VPMACSSDQLrr 4584
-VPMACSSWDrm 4585
-VPMACSSWDrr 4586
-VPMACSSWWrm 4587
-VPMACSSWWrr 4588
-VPMACSWDrm 4589
-VPMACSWDrr 4590
-VPMACSWWrm 4591
-VPMACSWWrr 4592
-VPMADCSSWDrm 4593
-VPMADCSSWDrr 4594
-VPMADCSWDrm 4595
-VPMADCSWDrr 4596
-VPMADD 4597
-VPMADDUBSWYrm 4598
-VPMADDUBSWYrr 4599
-VPMADDUBSWZ 4600
-VPMADDUBSWZrm 4601
-VPMADDUBSWZrmk 4602
-VPMADDUBSWZrmkz 4603
-VPMADDUBSWZrr 4604
-VPMADDUBSWZrrk 4605
-VPMADDUBSWZrrkz 4606
-VPMADDUBSWrm 4607
-VPMADDUBSWrr 4608
-VPMADDWDYrm 4609
-VPMADDWDYrr 4610
-VPMADDWDZ 4611
-VPMADDWDZrm 4612
-VPMADDWDZrmk 4613
-VPMADDWDZrmkz 4614
-VPMADDWDZrr 4615
-VPMADDWDZrrk 4616
-VPMADDWDZrrkz 4617
-VPMADDWDrm 4618
-VPMADDWDrr 4619
-VPMASKMOVDYmr 4620
-VPMASKMOVDYrm 4621
-VPMASKMOVDmr 4622
-VPMASKMOVDrm 4623
-VPMASKMOVQYmr 4624
-VPMASKMOVQYrm 4625
-VPMASKMOVQmr 4626
-VPMASKMOVQrm 4627
-VPMAXSBYrm 4628
-VPMAXSBYrr 4629
-VPMAXSBZ 4630
-VPMAXSBZrm 4631
-VPMAXSBZrmk 4632
-VPMAXSBZrmkz 4633
-VPMAXSBZrr 4634
-VPMAXSBZrrk 4635
-VPMAXSBZrrkz 4636
-VPMAXSBrm 4637
-VPMAXSBrr 4638
-VPMAXSDYrm 4639
-VPMAXSDYrr 4640
-VPMAXSDZ 4641
-VPMAXSDZrm 4642
-VPMAXSDZrmb 4643
-VPMAXSDZrmbk 4644
-VPMAXSDZrmbkz 4645
-VPMAXSDZrmk 4646
-VPMAXSDZrmkz 4647
-VPMAXSDZrr 4648
-VPMAXSDZrrk 4649
-VPMAXSDZrrkz 4650
-VPMAXSDrm 4651
-VPMAXSDrr 4652
-VPMAXSQZ 4653
-VPMAXSQZrm 4654
-VPMAXSQZrmb 4655
-VPMAXSQZrmbk 4656
-VPMAXSQZrmbkz 4657
-VPMAXSQZrmk 4658
-VPMAXSQZrmkz 4659
-VPMAXSQZrr 4660
-VPMAXSQZrrk 4661
-VPMAXSQZrrkz 4662
-VPMAXSWYrm 4663
-VPMAXSWYrr 4664
-VPMAXSWZ 4665
-VPMAXSWZrm 4666
-VPMAXSWZrmk 4667
-VPMAXSWZrmkz 4668
-VPMAXSWZrr 4669
-VPMAXSWZrrk 4670
-VPMAXSWZrrkz 4671
-VPMAXSWrm 4672
-VPMAXSWrr 4673
-VPMAXUBYrm 4674
-VPMAXUBYrr 4675
-VPMAXUBZ 4676
-VPMAXUBZrm 4677
-VPMAXUBZrmk 4678
-VPMAXUBZrmkz 4679
-VPMAXUBZrr 4680
-VPMAXUBZrrk 4681
-VPMAXUBZrrkz 4682
-VPMAXUBrm 4683
-VPMAXUBrr 4684
-VPMAXUDYrm 4685
-VPMAXUDYrr 4686
-VPMAXUDZ 4687
-VPMAXUDZrm 4688
-VPMAXUDZrmb 4689
-VPMAXUDZrmbk 4690
-VPMAXUDZrmbkz 4691
-VPMAXUDZrmk 4692
-VPMAXUDZrmkz 4693
-VPMAXUDZrr 4694
-VPMAXUDZrrk 4695
-VPMAXUDZrrkz 4696
-VPMAXUDrm 4697
-VPMAXUDrr 4698
-VPMAXUQZ 4699
-VPMAXUQZrm 4700
-VPMAXUQZrmb 4701
-VPMAXUQZrmbk 4702
-VPMAXUQZrmbkz 4703
-VPMAXUQZrmk 4704
-VPMAXUQZrmkz 4705
-VPMAXUQZrr 4706
-VPMAXUQZrrk 4707
-VPMAXUQZrrkz 4708
-VPMAXUWYrm 4709
-VPMAXUWYrr 4710
-VPMAXUWZ 4711
-VPMAXUWZrm 4712
-VPMAXUWZrmk 4713
-VPMAXUWZrmkz 4714
-VPMAXUWZrr 4715
-VPMAXUWZrrk 4716
-VPMAXUWZrrkz 4717
-VPMAXUWrm 4718
-VPMAXUWrr 4719
-VPMINSBYrm 4720
-VPMINSBYrr 4721
-VPMINSBZ 4722
-VPMINSBZrm 4723
-VPMINSBZrmk 4724
-VPMINSBZrmkz 4725
-VPMINSBZrr 4726
-VPMINSBZrrk 4727
-VPMINSBZrrkz 4728
-VPMINSBrm 4729
-VPMINSBrr 4730
-VPMINSDYrm 4731
-VPMINSDYrr 4732
-VPMINSDZ 4733
-VPMINSDZrm 4734
-VPMINSDZrmb 4735
-VPMINSDZrmbk 4736
-VPMINSDZrmbkz 4737
-VPMINSDZrmk 4738
-VPMINSDZrmkz 4739
-VPMINSDZrr 4740
-VPMINSDZrrk 4741
-VPMINSDZrrkz 4742
-VPMINSDrm 4743
-VPMINSDrr 4744
-VPMINSQZ 4745
-VPMINSQZrm 4746
-VPMINSQZrmb 4747
-VPMINSQZrmbk 4748
-VPMINSQZrmbkz 4749
-VPMINSQZrmk 4750
-VPMINSQZrmkz 4751
-VPMINSQZrr 4752
-VPMINSQZrrk 4753
-VPMINSQZrrkz 4754
-VPMINSWYrm 4755
-VPMINSWYrr 4756
-VPMINSWZ 4757
-VPMINSWZrm 4758
-VPMINSWZrmk 4759
-VPMINSWZrmkz 4760
-VPMINSWZrr 4761
-VPMINSWZrrk 4762
-VPMINSWZrrkz 4763
-VPMINSWrm 4764
-VPMINSWrr 4765
-VPMINUBYrm 4766
-VPMINUBYrr 4767
-VPMINUBZ 4768
-VPMINUBZrm 4769
-VPMINUBZrmk 4770
-VPMINUBZrmkz 4771
-VPMINUBZrr 4772
-VPMINUBZrrk 4773
-VPMINUBZrrkz 4774
-VPMINUBrm 4775
-VPMINUBrr 4776
-VPMINUDYrm 4777
-VPMINUDYrr 4778
-VPMINUDZ 4779
-VPMINUDZrm 4780
-VPMINUDZrmb 4781
-VPMINUDZrmbk 4782
-VPMINUDZrmbkz 4783
-VPMINUDZrmk 4784
-VPMINUDZrmkz 4785
-VPMINUDZrr 4786
-VPMINUDZrrk 4787
-VPMINUDZrrkz 4788
-VPMINUDrm 4789
-VPMINUDrr 4790
-VPMINUQZ 4791
-VPMINUQZrm 4792
-VPMINUQZrmb 4793
-VPMINUQZrmbk 4794
-VPMINUQZrmbkz 4795
-VPMINUQZrmk 4796
-VPMINUQZrmkz 4797
-VPMINUQZrr 4798
-VPMINUQZrrk 4799
-VPMINUQZrrkz 4800
-VPMINUWYrm 4801
-VPMINUWYrr 4802
-VPMINUWZ 4803
-VPMINUWZrm 4804
-VPMINUWZrmk 4805
-VPMINUWZrmkz 4806
-VPMINUWZrr 4807
-VPMINUWZrrk 4808
-VPMINUWZrrkz 4809
-VPMINUWrm 4810
-VPMINUWrr 4811
-VPMOVB 4812
-VPMOVD 4813
-VPMOVDBZ 4814
-VPMOVDBZmr 4815
-VPMOVDBZmrk 4816
-VPMOVDBZrr 4817
-VPMOVDBZrrk 4818
-VPMOVDBZrrkz 4819
-VPMOVDWZ 4820
-VPMOVDWZmr 4821
-VPMOVDWZmrk 4822
-VPMOVDWZrr 4823
-VPMOVDWZrrk 4824
-VPMOVDWZrrkz 4825
-VPMOVM 4826
-VPMOVMSKBYrr 4827
-VPMOVMSKBrr 4828
-VPMOVQ 4829
-VPMOVQBZ 4830
-VPMOVQBZmr 4831
-VPMOVQBZmrk 4832
-VPMOVQBZrr 4833
-VPMOVQBZrrk 4834
-VPMOVQBZrrkz 4835
-VPMOVQDZ 4836
-VPMOVQDZmr 4837
-VPMOVQDZmrk 4838
-VPMOVQDZrr 4839
-VPMOVQDZrrk 4840
-VPMOVQDZrrkz 4841
-VPMOVQWZ 4842
-VPMOVQWZmr 4843
-VPMOVQWZmrk 4844
-VPMOVQWZrr 4845
-VPMOVQWZrrk 4846
-VPMOVQWZrrkz 4847
-VPMOVSDBZ 4848
-VPMOVSDBZmr 4849
-VPMOVSDBZmrk 4850
-VPMOVSDBZrr 4851
-VPMOVSDBZrrk 4852
-VPMOVSDBZrrkz 4853
-VPMOVSDWZ 4854
-VPMOVSDWZmr 4855
-VPMOVSDWZmrk 4856
-VPMOVSDWZrr 4857
-VPMOVSDWZrrk 4858
-VPMOVSDWZrrkz 4859
-VPMOVSQBZ 4860
-VPMOVSQBZmr 4861
-VPMOVSQBZmrk 4862
-VPMOVSQBZrr 4863
-VPMOVSQBZrrk 4864
-VPMOVSQBZrrkz 4865
-VPMOVSQDZ 4866
-VPMOVSQDZmr 4867
-VPMOVSQDZmrk 4868
-VPMOVSQDZrr 4869
-VPMOVSQDZrrk 4870
-VPMOVSQDZrrkz 4871
-VPMOVSQWZ 4872
-VPMOVSQWZmr 4873
-VPMOVSQWZmrk 4874
-VPMOVSQWZrr 4875
-VPMOVSQWZrrk 4876
-VPMOVSQWZrrkz 4877
-VPMOVSWBZ 4878
-VPMOVSWBZmr 4879
-VPMOVSWBZmrk 4880
-VPMOVSWBZrr 4881
-VPMOVSWBZrrk 4882
-VPMOVSWBZrrkz 4883
-VPMOVSXBDYrm 4884
-VPMOVSXBDYrr 4885
-VPMOVSXBDZ 4886
-VPMOVSXBDZrm 4887
-VPMOVSXBDZrmk 4888
-VPMOVSXBDZrmkz 4889
-VPMOVSXBDZrr 4890
-VPMOVSXBDZrrk 4891
-VPMOVSXBDZrrkz 4892
-VPMOVSXBDrm 4893
-VPMOVSXBDrr 4894
-VPMOVSXBQYrm 4895
-VPMOVSXBQYrr 4896
-VPMOVSXBQZ 4897
-VPMOVSXBQZrm 4898
-VPMOVSXBQZrmk 4899
-VPMOVSXBQZrmkz 4900
-VPMOVSXBQZrr 4901
-VPMOVSXBQZrrk 4902
-VPMOVSXBQZrrkz 4903
-VPMOVSXBQrm 4904
-VPMOVSXBQrr 4905
-VPMOVSXBWYrm 4906
-VPMOVSXBWYrr 4907
-VPMOVSXBWZ 4908
-VPMOVSXBWZrm 4909
-VPMOVSXBWZrmk 4910
-VPMOVSXBWZrmkz 4911
-VPMOVSXBWZrr 4912
-VPMOVSXBWZrrk 4913
-VPMOVSXBWZrrkz 4914
-VPMOVSXBWrm 4915
-VPMOVSXBWrr 4916
-VPMOVSXDQYrm 4917
-VPMOVSXDQYrr 4918
-VPMOVSXDQZ 4919
-VPMOVSXDQZrm 4920
-VPMOVSXDQZrmk 4921
-VPMOVSXDQZrmkz 4922
-VPMOVSXDQZrr 4923
-VPMOVSXDQZrrk 4924
-VPMOVSXDQZrrkz 4925
-VPMOVSXDQrm 4926
-VPMOVSXDQrr 4927
-VPMOVSXWDYrm 4928
-VPMOVSXWDYrr 4929
-VPMOVSXWDZ 4930
-VPMOVSXWDZrm 4931
-VPMOVSXWDZrmk 4932
-VPMOVSXWDZrmkz 4933
-VPMOVSXWDZrr 4934
-VPMOVSXWDZrrk 4935
-VPMOVSXWDZrrkz 4936
-VPMOVSXWDrm 4937
-VPMOVSXWDrr 4938
-VPMOVSXWQYrm 4939
-VPMOVSXWQYrr 4940
-VPMOVSXWQZ 4941
-VPMOVSXWQZrm 4942
-VPMOVSXWQZrmk 4943
-VPMOVSXWQZrmkz 4944
-VPMOVSXWQZrr 4945
-VPMOVSXWQZrrk 4946
-VPMOVSXWQZrrkz 4947
-VPMOVSXWQrm 4948
-VPMOVSXWQrr 4949
-VPMOVUSDBZ 4950
-VPMOVUSDBZmr 4951
-VPMOVUSDBZmrk 4952
-VPMOVUSDBZrr 4953
-VPMOVUSDBZrrk 4954
-VPMOVUSDBZrrkz 4955
-VPMOVUSDWZ 4956
-VPMOVUSDWZmr 4957
-VPMOVUSDWZmrk 4958
-VPMOVUSDWZrr 4959
-VPMOVUSDWZrrk 4960
-VPMOVUSDWZrrkz 4961
-VPMOVUSQBZ 4962
-VPMOVUSQBZmr 4963
-VPMOVUSQBZmrk 4964
-VPMOVUSQBZrr 4965
-VPMOVUSQBZrrk 4966
-VPMOVUSQBZrrkz 4967
-VPMOVUSQDZ 4968
-VPMOVUSQDZmr 4969
-VPMOVUSQDZmrk 4970
-VPMOVUSQDZrr 4971
-VPMOVUSQDZrrk 4972
-VPMOVUSQDZrrkz 4973
-VPMOVUSQWZ 4974
-VPMOVUSQWZmr 4975
-VPMOVUSQWZmrk 4976
-VPMOVUSQWZrr 4977
-VPMOVUSQWZrrk 4978
-VPMOVUSQWZrrkz 4979
-VPMOVUSWBZ 4980
-VPMOVUSWBZmr 4981
-VPMOVUSWBZmrk 4982
-VPMOVUSWBZrr 4983
-VPMOVUSWBZrrk 4984
-VPMOVUSWBZrrkz 4985
-VPMOVW 4986
-VPMOVWBZ 4987
-VPMOVWBZmr 4988
-VPMOVWBZmrk 4989
-VPMOVWBZrr 4990
-VPMOVWBZrrk 4991
-VPMOVWBZrrkz 4992
-VPMOVZXBDYrm 4993
-VPMOVZXBDYrr 4994
-VPMOVZXBDZ 4995
-VPMOVZXBDZrm 4996
-VPMOVZXBDZrmk 4997
-VPMOVZXBDZrmkz 4998
-VPMOVZXBDZrr 4999
-VPMOVZXBDZrrk 5000
-VPMOVZXBDZrrkz 5001
-VPMOVZXBDrm 5002
-VPMOVZXBDrr 5003
-VPMOVZXBQYrm 5004
-VPMOVZXBQYrr 5005
-VPMOVZXBQZ 5006
-VPMOVZXBQZrm 5007
-VPMOVZXBQZrmk 5008
-VPMOVZXBQZrmkz 5009
-VPMOVZXBQZrr 5010
-VPMOVZXBQZrrk 5011
-VPMOVZXBQZrrkz 5012
-VPMOVZXBQrm 5013
-VPMOVZXBQrr 5014
-VPMOVZXBWYrm 5015
-VPMOVZXBWYrr 5016
-VPMOVZXBWZ 5017
-VPMOVZXBWZrm 5018
-VPMOVZXBWZrmk 5019
-VPMOVZXBWZrmkz 5020
-VPMOVZXBWZrr 5021
-VPMOVZXBWZrrk 5022
-VPMOVZXBWZrrkz 5023
-VPMOVZXBWrm 5024
-VPMOVZXBWrr 5025
-VPMOVZXDQYrm 5026
-VPMOVZXDQYrr 5027
-VPMOVZXDQZ 5028
-VPMOVZXDQZrm 5029
-VPMOVZXDQZrmk 5030
-VPMOVZXDQZrmkz 5031
-VPMOVZXDQZrr 5032
-VPMOVZXDQZrrk 5033
-VPMOVZXDQZrrkz 5034
-VPMOVZXDQrm 5035
-VPMOVZXDQrr 5036
-VPMOVZXWDYrm 5037
-VPMOVZXWDYrr 5038
-VPMOVZXWDZ 5039
-VPMOVZXWDZrm 5040
-VPMOVZXWDZrmk 5041
-VPMOVZXWDZrmkz 5042
-VPMOVZXWDZrr 5043
-VPMOVZXWDZrrk 5044
-VPMOVZXWDZrrkz 5045
-VPMOVZXWDrm 5046
-VPMOVZXWDrr 5047
-VPMOVZXWQYrm 5048
-VPMOVZXWQYrr 5049
-VPMOVZXWQZ 5050
-VPMOVZXWQZrm 5051
-VPMOVZXWQZrmk 5052
-VPMOVZXWQZrmkz 5053
-VPMOVZXWQZrr 5054
-VPMOVZXWQZrrk 5055
-VPMOVZXWQZrrkz 5056
-VPMOVZXWQrm 5057
-VPMOVZXWQrr 5058
-VPMULDQYrm 5059
-VPMULDQYrr 5060
-VPMULDQZ 5061
-VPMULDQZrm 5062
-VPMULDQZrmb 5063
-VPMULDQZrmbk 5064
-VPMULDQZrmbkz 5065
-VPMULDQZrmk 5066
-VPMULDQZrmkz 5067
-VPMULDQZrr 5068
-VPMULDQZrrk 5069
-VPMULDQZrrkz 5070
-VPMULDQrm 5071
-VPMULDQrr 5072
-VPMULHRSWYrm 5073
-VPMULHRSWYrr 5074
-VPMULHRSWZ 5075
-VPMULHRSWZrm 5076
-VPMULHRSWZrmk 5077
-VPMULHRSWZrmkz 5078
-VPMULHRSWZrr 5079
-VPMULHRSWZrrk 5080
-VPMULHRSWZrrkz 5081
-VPMULHRSWrm 5082
-VPMULHRSWrr 5083
-VPMULHUWYrm 5084
-VPMULHUWYrr 5085
-VPMULHUWZ 5086
-VPMULHUWZrm 5087
-VPMULHUWZrmk 5088
-VPMULHUWZrmkz 5089
-VPMULHUWZrr 5090
-VPMULHUWZrrk 5091
-VPMULHUWZrrkz 5092
-VPMULHUWrm 5093
-VPMULHUWrr 5094
-VPMULHWYrm 5095
-VPMULHWYrr 5096
-VPMULHWZ 5097
-VPMULHWZrm 5098
-VPMULHWZrmk 5099
-VPMULHWZrmkz 5100
-VPMULHWZrr 5101
-VPMULHWZrrk 5102
-VPMULHWZrrkz 5103
-VPMULHWrm 5104
-VPMULHWrr 5105
-VPMULLDYrm 5106
-VPMULLDYrr 5107
-VPMULLDZ 5108
-VPMULLDZrm 5109
-VPMULLDZrmb 5110
-VPMULLDZrmbk 5111
-VPMULLDZrmbkz 5112
-VPMULLDZrmk 5113
-VPMULLDZrmkz 5114
-VPMULLDZrr 5115
-VPMULLDZrrk 5116
-VPMULLDZrrkz 5117
-VPMULLDrm 5118
-VPMULLDrr 5119
-VPMULLQZ 5120
-VPMULLQZrm 5121
-VPMULLQZrmb 5122
-VPMULLQZrmbk 5123
-VPMULLQZrmbkz 5124
-VPMULLQZrmk 5125
-VPMULLQZrmkz 5126
-VPMULLQZrr 5127
-VPMULLQZrrk 5128
-VPMULLQZrrkz 5129
-VPMULLWYrm 5130
-VPMULLWYrr 5131
-VPMULLWZ 5132
-VPMULLWZrm 5133
-VPMULLWZrmk 5134
-VPMULLWZrmkz 5135
-VPMULLWZrr 5136
-VPMULLWZrrk 5137
-VPMULLWZrrkz 5138
-VPMULLWrm 5139
-VPMULLWrr 5140
-VPMULTISHIFTQBZ 5141
-VPMULTISHIFTQBZrm 5142
-VPMULTISHIFTQBZrmb 5143
-VPMULTISHIFTQBZrmbk 5144
-VPMULTISHIFTQBZrmbkz 5145
-VPMULTISHIFTQBZrmk 5146
-VPMULTISHIFTQBZrmkz 5147
-VPMULTISHIFTQBZrr 5148
-VPMULTISHIFTQBZrrk 5149
-VPMULTISHIFTQBZrrkz 5150
-VPMULUDQYrm 5151
-VPMULUDQYrr 5152
-VPMULUDQZ 5153
-VPMULUDQZrm 5154
-VPMULUDQZrmb 5155
-VPMULUDQZrmbk 5156
-VPMULUDQZrmbkz 5157
-VPMULUDQZrmk 5158
-VPMULUDQZrmkz 5159
-VPMULUDQZrr 5160
-VPMULUDQZrrk 5161
-VPMULUDQZrrkz 5162
-VPMULUDQrm 5163
-VPMULUDQrr 5164
-VPOPCNTBZ 5165
-VPOPCNTBZrm 5166
-VPOPCNTBZrmk 5167
-VPOPCNTBZrmkz 5168
-VPOPCNTBZrr 5169
-VPOPCNTBZrrk 5170
-VPOPCNTBZrrkz 5171
-VPOPCNTDZ 5172
-VPOPCNTDZrm 5173
-VPOPCNTDZrmb 5174
-VPOPCNTDZrmbk 5175
-VPOPCNTDZrmbkz 5176
-VPOPCNTDZrmk 5177
-VPOPCNTDZrmkz 5178
-VPOPCNTDZrr 5179
-VPOPCNTDZrrk 5180
-VPOPCNTDZrrkz 5181
-VPOPCNTQZ 5182
-VPOPCNTQZrm 5183
-VPOPCNTQZrmb 5184
-VPOPCNTQZrmbk 5185
-VPOPCNTQZrmbkz 5186
-VPOPCNTQZrmk 5187
-VPOPCNTQZrmkz 5188
-VPOPCNTQZrr 5189
-VPOPCNTQZrrk 5190
-VPOPCNTQZrrkz 5191
-VPOPCNTWZ 5192
-VPOPCNTWZrm 5193
-VPOPCNTWZrmk 5194
-VPOPCNTWZrmkz 5195
-VPOPCNTWZrr 5196
-VPOPCNTWZrrk 5197
-VPOPCNTWZrrkz 5198
-VPORDZ 5199
-VPORDZrm 5200
-VPORDZrmb 5201
-VPORDZrmbk 5202
-VPORDZrmbkz 5203
-VPORDZrmk 5204
-VPORDZrmkz 5205
-VPORDZrr 5206
-VPORDZrrk 5207
-VPORDZrrkz 5208
-VPORQZ 5209
-VPORQZrm 5210
-VPORQZrmb 5211
-VPORQZrmbk 5212
-VPORQZrmbkz 5213
-VPORQZrmk 5214
-VPORQZrmkz 5215
-VPORQZrr 5216
-VPORQZrrk 5217
-VPORQZrrkz 5218
-VPORYrm 5219
-VPORYrr 5220
-VPORrm 5221
-VPORrr 5222
-VPPERMrmr 5223
-VPPERMrrm 5224
-VPPERMrrr 5225
-VPPERMrrr_REV 5226
-VPROLDZ 5227
-VPROLDZmbi 5228
-VPROLDZmbik 5229
-VPROLDZmbikz 5230
-VPROLDZmi 5231
-VPROLDZmik 5232
-VPROLDZmikz 5233
-VPROLDZri 5234
-VPROLDZrik 5235
-VPROLDZrikz 5236
-VPROLQZ 5237
-VPROLQZmbi 5238
-VPROLQZmbik 5239
-VPROLQZmbikz 5240
-VPROLQZmi 5241
-VPROLQZmik 5242
-VPROLQZmikz 5243
-VPROLQZri 5244
-VPROLQZrik 5245
-VPROLQZrikz 5246
-VPROLVDZ 5247
-VPROLVDZrm 5248
-VPROLVDZrmb 5249
-VPROLVDZrmbk 5250
-VPROLVDZrmbkz 5251
-VPROLVDZrmk 5252
-VPROLVDZrmkz 5253
-VPROLVDZrr 5254
-VPROLVDZrrk 5255
-VPROLVDZrrkz 5256
-VPROLVQZ 5257
-VPROLVQZrm 5258
-VPROLVQZrmb 5259
-VPROLVQZrmbk 5260
-VPROLVQZrmbkz 5261
-VPROLVQZrmk 5262
-VPROLVQZrmkz 5263
-VPROLVQZrr 5264
-VPROLVQZrrk 5265
-VPROLVQZrrkz 5266
-VPRORDZ 5267
-VPRORDZmbi 5268
-VPRORDZmbik 5269
-VPRORDZmbikz 5270
-VPRORDZmi 5271
-VPRORDZmik 5272
-VPRORDZmikz 5273
-VPRORDZri 5274
-VPRORDZrik 5275
-VPRORDZrikz 5276
-VPRORQZ 5277
-VPRORQZmbi 5278
-VPRORQZmbik 5279
-VPRORQZmbikz 5280
-VPRORQZmi 5281
-VPRORQZmik 5282
-VPRORQZmikz 5283
-VPRORQZri 5284
-VPRORQZrik 5285
-VPRORQZrikz 5286
-VPRORVDZ 5287
-VPRORVDZrm 5288
-VPRORVDZrmb 5289
-VPRORVDZrmbk 5290
-VPRORVDZrmbkz 5291
-VPRORVDZrmk 5292
-VPRORVDZrmkz 5293
-VPRORVDZrr 5294
-VPRORVDZrrk 5295
-VPRORVDZrrkz 5296
-VPRORVQZ 5297
-VPRORVQZrm 5298
-VPRORVQZrmb 5299
-VPRORVQZrmbk 5300
-VPRORVQZrmbkz 5301
-VPRORVQZrmk 5302
-VPRORVQZrmkz 5303
-VPRORVQZrr 5304
-VPRORVQZrrk 5305
-VPRORVQZrrkz 5306
-VPROTBmi 5307
-VPROTBmr 5308
-VPROTBri 5309
-VPROTBrm 5310
-VPROTBrr 5311
-VPROTBrr_REV 5312
-VPROTDmi 5313
-VPROTDmr 5314
-VPROTDri 5315
-VPROTDrm 5316
-VPROTDrr 5317
-VPROTDrr_REV 5318
-VPROTQmi 5319
-VPROTQmr 5320
-VPROTQri 5321
-VPROTQrm 5322
-VPROTQrr 5323
-VPROTQrr_REV 5324
-VPROTWmi 5325
-VPROTWmr 5326
-VPROTWri 5327
-VPROTWrm 5328
-VPROTWrr 5329
-VPROTWrr_REV 5330
-VPSADBWYrm 5331
-VPSADBWYrr 5332
-VPSADBWZ 5333
-VPSADBWZrm 5334
-VPSADBWZrr 5335
-VPSADBWrm 5336
-VPSADBWrr 5337
-VPSCATTERDDZ 5338
-VPSCATTERDDZmr 5339
-VPSCATTERDQZ 5340
-VPSCATTERDQZmr 5341
-VPSCATTERQDZ 5342
-VPSCATTERQDZmr 5343
-VPSCATTERQQZ 5344
-VPSCATTERQQZmr 5345
-VPSHABmr 5346
-VPSHABrm 5347
-VPSHABrr 5348
-VPSHABrr_REV 5349
-VPSHADmr 5350
-VPSHADrm 5351
-VPSHADrr 5352
-VPSHADrr_REV 5353
-VPSHAQmr 5354
-VPSHAQrm 5355
-VPSHAQrr 5356
-VPSHAQrr_REV 5357
-VPSHAWmr 5358
-VPSHAWrm 5359
-VPSHAWrr 5360
-VPSHAWrr_REV 5361
-VPSHLBmr 5362
-VPSHLBrm 5363
-VPSHLBrr 5364
-VPSHLBrr_REV 5365
-VPSHLDDZ 5366
-VPSHLDDZrmbi 5367
-VPSHLDDZrmbik 5368
-VPSHLDDZrmbikz 5369
-VPSHLDDZrmi 5370
-VPSHLDDZrmik 5371
-VPSHLDDZrmikz 5372
-VPSHLDDZrri 5373
-VPSHLDDZrrik 5374
-VPSHLDDZrrikz 5375
-VPSHLDQZ 5376
-VPSHLDQZrmbi 5377
-VPSHLDQZrmbik 5378
-VPSHLDQZrmbikz 5379
-VPSHLDQZrmi 5380
-VPSHLDQZrmik 5381
-VPSHLDQZrmikz 5382
-VPSHLDQZrri 5383
-VPSHLDQZrrik 5384
-VPSHLDQZrrikz 5385
-VPSHLDVDZ 5386
-VPSHLDVDZm 5387
-VPSHLDVDZmb 5388
-VPSHLDVDZmbk 5389
-VPSHLDVDZmbkz 5390
-VPSHLDVDZmk 5391
-VPSHLDVDZmkz 5392
-VPSHLDVDZr 5393
-VPSHLDVDZrk 5394
-VPSHLDVDZrkz 5395
-VPSHLDVQZ 5396
-VPSHLDVQZm 5397
-VPSHLDVQZmb 5398
-VPSHLDVQZmbk 5399
-VPSHLDVQZmbkz 5400
-VPSHLDVQZmk 5401
-VPSHLDVQZmkz 5402
-VPSHLDVQZr 5403
-VPSHLDVQZrk 5404
-VPSHLDVQZrkz 5405
-VPSHLDVWZ 5406
-VPSHLDVWZm 5407
-VPSHLDVWZmk 5408
-VPSHLDVWZmkz 5409
-VPSHLDVWZr 5410
-VPSHLDVWZrk 5411
-VPSHLDVWZrkz 5412
-VPSHLDWZ 5413
-VPSHLDWZrmi 5414
-VPSHLDWZrmik 5415
-VPSHLDWZrmikz 5416
-VPSHLDWZrri 5417
-VPSHLDWZrrik 5418
-VPSHLDWZrrikz 5419
-VPSHLDmr 5420
-VPSHLDrm 5421
-VPSHLDrr 5422
-VPSHLDrr_REV 5423
-VPSHLQmr 5424
-VPSHLQrm 5425
-VPSHLQrr 5426
-VPSHLQrr_REV 5427
-VPSHLWmr 5428
-VPSHLWrm 5429
-VPSHLWrr 5430
-VPSHLWrr_REV 5431
-VPSHRDDZ 5432
-VPSHRDDZrmbi 5433
-VPSHRDDZrmbik 5434
-VPSHRDDZrmbikz 5435
-VPSHRDDZrmi 5436
-VPSHRDDZrmik 5437
-VPSHRDDZrmikz 5438
-VPSHRDDZrri 5439
-VPSHRDDZrrik 5440
-VPSHRDDZrrikz 5441
-VPSHRDQZ 5442
-VPSHRDQZrmbi 5443
-VPSHRDQZrmbik 5444
-VPSHRDQZrmbikz 5445
-VPSHRDQZrmi 5446
-VPSHRDQZrmik 5447
-VPSHRDQZrmikz 5448
-VPSHRDQZrri 5449
-VPSHRDQZrrik 5450
-VPSHRDQZrrikz 5451
-VPSHRDVDZ 5452
-VPSHRDVDZm 5453
-VPSHRDVDZmb 5454
-VPSHRDVDZmbk 5455
-VPSHRDVDZmbkz 5456
-VPSHRDVDZmk 5457
-VPSHRDVDZmkz 5458
-VPSHRDVDZr 5459
-VPSHRDVDZrk 5460
-VPSHRDVDZrkz 5461
-VPSHRDVQZ 5462
-VPSHRDVQZm 5463
-VPSHRDVQZmb 5464
-VPSHRDVQZmbk 5465
-VPSHRDVQZmbkz 5466
-VPSHRDVQZmk 5467
-VPSHRDVQZmkz 5468
-VPSHRDVQZr 5469
-VPSHRDVQZrk 5470
-VPSHRDVQZrkz 5471
-VPSHRDVWZ 5472
-VPSHRDVWZm 5473
-VPSHRDVWZmk 5474
-VPSHRDVWZmkz 5475
-VPSHRDVWZr 5476
-VPSHRDVWZrk 5477
-VPSHRDVWZrkz 5478
-VPSHRDWZ 5479
-VPSHRDWZrmi 5480
-VPSHRDWZrmik 5481
-VPSHRDWZrmikz 5482
-VPSHRDWZrri 5483
-VPSHRDWZrrik 5484
-VPSHRDWZrrikz 5485
-VPSHUFBITQMBZ 5486
-VPSHUFBITQMBZrm 5487
-VPSHUFBITQMBZrmk 5488
-VPSHUFBITQMBZrr 5489
-VPSHUFBITQMBZrrk 5490
-VPSHUFBYrm 5491
-VPSHUFBYrr 5492
-VPSHUFBZ 5493
-VPSHUFBZrm 5494
-VPSHUFBZrmk 5495
-VPSHUFBZrmkz 5496
-VPSHUFBZrr 5497
-VPSHUFBZrrk 5498
-VPSHUFBZrrkz 5499
-VPSHUFBrm 5500
-VPSHUFBrr 5501
-VPSHUFDYmi 5502
-VPSHUFDYri 5503
-VPSHUFDZ 5504
-VPSHUFDZmbi 5505
-VPSHUFDZmbik 5506
-VPSHUFDZmbikz 5507
-VPSHUFDZmi 5508
-VPSHUFDZmik 5509
-VPSHUFDZmikz 5510
-VPSHUFDZri 5511
-VPSHUFDZrik 5512
-VPSHUFDZrikz 5513
-VPSHUFDmi 5514
-VPSHUFDri 5515
-VPSHUFHWYmi 5516
-VPSHUFHWYri 5517
-VPSHUFHWZ 5518
-VPSHUFHWZmi 5519
-VPSHUFHWZmik 5520
-VPSHUFHWZmikz 5521
-VPSHUFHWZri 5522
-VPSHUFHWZrik 5523
-VPSHUFHWZrikz 5524
-VPSHUFHWmi 5525
-VPSHUFHWri 5526
-VPSHUFLWYmi 5527
-VPSHUFLWYri 5528
-VPSHUFLWZ 5529
-VPSHUFLWZmi 5530
-VPSHUFLWZmik 5531
-VPSHUFLWZmikz 5532
-VPSHUFLWZri 5533
-VPSHUFLWZrik 5534
-VPSHUFLWZrikz 5535
-VPSHUFLWmi 5536
-VPSHUFLWri 5537
-VPSIGNBYrm 5538
-VPSIGNBYrr 5539
-VPSIGNBrm 5540
-VPSIGNBrr 5541
-VPSIGNDYrm 5542
-VPSIGNDYrr 5543
-VPSIGNDrm 5544
-VPSIGNDrr 5545
-VPSIGNWYrm 5546
-VPSIGNWYrr 5547
-VPSIGNWrm 5548
-VPSIGNWrr 5549
-VPSLLDQYri 5550
-VPSLLDQZ 5551
-VPSLLDQZmi 5552
-VPSLLDQZri 5553
-VPSLLDQri 5554
-VPSLLDYri 5555
-VPSLLDYrm 5556
-VPSLLDYrr 5557
-VPSLLDZ 5558
-VPSLLDZmbi 5559
-VPSLLDZmbik 5560
-VPSLLDZmbikz 5561
-VPSLLDZmi 5562
-VPSLLDZmik 5563
-VPSLLDZmikz 5564
-VPSLLDZri 5565
-VPSLLDZrik 5566
-VPSLLDZrikz 5567
-VPSLLDZrm 5568
-VPSLLDZrmk 5569
-VPSLLDZrmkz 5570
-VPSLLDZrr 5571
-VPSLLDZrrk 5572
-VPSLLDZrrkz 5573
-VPSLLDri 5574
-VPSLLDrm 5575
-VPSLLDrr 5576
-VPSLLQYri 5577
-VPSLLQYrm 5578
-VPSLLQYrr 5579
-VPSLLQZ 5580
-VPSLLQZmbi 5581
-VPSLLQZmbik 5582
-VPSLLQZmbikz 5583
-VPSLLQZmi 5584
-VPSLLQZmik 5585
-VPSLLQZmikz 5586
-VPSLLQZri 5587
-VPSLLQZrik 5588
-VPSLLQZrikz 5589
-VPSLLQZrm 5590
-VPSLLQZrmk 5591
-VPSLLQZrmkz 5592
-VPSLLQZrr 5593
-VPSLLQZrrk 5594
-VPSLLQZrrkz 5595
-VPSLLQri 5596
-VPSLLQrm 5597
-VPSLLQrr 5598
-VPSLLVDYrm 5599
-VPSLLVDYrr 5600
-VPSLLVDZ 5601
-VPSLLVDZrm 5602
-VPSLLVDZrmb 5603
-VPSLLVDZrmbk 5604
-VPSLLVDZrmbkz 5605
-VPSLLVDZrmk 5606
-VPSLLVDZrmkz 5607
-VPSLLVDZrr 5608
-VPSLLVDZrrk 5609
-VPSLLVDZrrkz 5610
-VPSLLVDrm 5611
-VPSLLVDrr 5612
-VPSLLVQYrm 5613
-VPSLLVQYrr 5614
-VPSLLVQZ 5615
-VPSLLVQZrm 5616
-VPSLLVQZrmb 5617
-VPSLLVQZrmbk 5618
-VPSLLVQZrmbkz 5619
-VPSLLVQZrmk 5620
-VPSLLVQZrmkz 5621
-VPSLLVQZrr 5622
-VPSLLVQZrrk 5623
-VPSLLVQZrrkz 5624
-VPSLLVQrm 5625
-VPSLLVQrr 5626
-VPSLLVWZ 5627
-VPSLLVWZrm 5628
-VPSLLVWZrmk 5629
-VPSLLVWZrmkz 5630
-VPSLLVWZrr 5631
-VPSLLVWZrrk 5632
-VPSLLVWZrrkz 5633
-VPSLLWYri 5634
-VPSLLWYrm 5635
-VPSLLWYrr 5636
-VPSLLWZ 5637
-VPSLLWZmi 5638
-VPSLLWZmik 5639
-VPSLLWZmikz 5640
-VPSLLWZri 5641
-VPSLLWZrik 5642
-VPSLLWZrikz 5643
-VPSLLWZrm 5644
-VPSLLWZrmk 5645
-VPSLLWZrmkz 5646
-VPSLLWZrr 5647
-VPSLLWZrrk 5648
-VPSLLWZrrkz 5649
-VPSLLWri 5650
-VPSLLWrm 5651
-VPSLLWrr 5652
-VPSRADYri 5653
-VPSRADYrm 5654
-VPSRADYrr 5655
-VPSRADZ 5656
-VPSRADZmbi 5657
-VPSRADZmbik 5658
-VPSRADZmbikz 5659
-VPSRADZmi 5660
-VPSRADZmik 5661
-VPSRADZmikz 5662
-VPSRADZri 5663
-VPSRADZrik 5664
-VPSRADZrikz 5665
-VPSRADZrm 5666
-VPSRADZrmk 5667
-VPSRADZrmkz 5668
-VPSRADZrr 5669
-VPSRADZrrk 5670
-VPSRADZrrkz 5671
-VPSRADri 5672
-VPSRADrm 5673
-VPSRADrr 5674
-VPSRAQZ 5675
-VPSRAQZmbi 5676
-VPSRAQZmbik 5677
-VPSRAQZmbikz 5678
-VPSRAQZmi 5679
-VPSRAQZmik 5680
-VPSRAQZmikz 5681
-VPSRAQZri 5682
-VPSRAQZrik 5683
-VPSRAQZrikz 5684
-VPSRAQZrm 5685
-VPSRAQZrmk 5686
-VPSRAQZrmkz 5687
-VPSRAQZrr 5688
-VPSRAQZrrk 5689
-VPSRAQZrrkz 5690
-VPSRAVDYrm 5691
-VPSRAVDYrr 5692
-VPSRAVDZ 5693
-VPSRAVDZrm 5694
-VPSRAVDZrmb 5695
-VPSRAVDZrmbk 5696
-VPSRAVDZrmbkz 5697
-VPSRAVDZrmk 5698
-VPSRAVDZrmkz 5699
-VPSRAVDZrr 5700
-VPSRAVDZrrk 5701
-VPSRAVDZrrkz 5702
-VPSRAVDrm 5703
-VPSRAVDrr 5704
-VPSRAVQZ 5705
-VPSRAVQZrm 5706
-VPSRAVQZrmb 5707
-VPSRAVQZrmbk 5708
-VPSRAVQZrmbkz 5709
-VPSRAVQZrmk 5710
-VPSRAVQZrmkz 5711
-VPSRAVQZrr 5712
-VPSRAVQZrrk 5713
-VPSRAVQZrrkz 5714
-VPSRAVWZ 5715
-VPSRAVWZrm 5716
-VPSRAVWZrmk 5717
-VPSRAVWZrmkz 5718
-VPSRAVWZrr 5719
-VPSRAVWZrrk 5720
-VPSRAVWZrrkz 5721
-VPSRAWYri 5722
-VPSRAWYrm 5723
-VPSRAWYrr 5724
-VPSRAWZ 5725
-VPSRAWZmi 5726
-VPSRAWZmik 5727
-VPSRAWZmikz 5728
-VPSRAWZri 5729
-VPSRAWZrik 5730
-VPSRAWZrikz 5731
-VPSRAWZrm 5732
-VPSRAWZrmk 5733
-VPSRAWZrmkz 5734
-VPSRAWZrr 5735
-VPSRAWZrrk 5736
-VPSRAWZrrkz 5737
-VPSRAWri 5738
-VPSRAWrm 5739
-VPSRAWrr 5740
-VPSRLDQYri 5741
-VPSRLDQZ 5742
-VPSRLDQZmi 5743
-VPSRLDQZri 5744
-VPSRLDQri 5745
-VPSRLDYri 5746
-VPSRLDYrm 5747
-VPSRLDYrr 5748
-VPSRLDZ 5749
-VPSRLDZmbi 5750
-VPSRLDZmbik 5751
-VPSRLDZmbikz 5752
-VPSRLDZmi 5753
-VPSRLDZmik 5754
-VPSRLDZmikz 5755
-VPSRLDZri 5756
-VPSRLDZrik 5757
-VPSRLDZrikz 5758
-VPSRLDZrm 5759
-VPSRLDZrmk 5760
-VPSRLDZrmkz 5761
-VPSRLDZrr 5762
-VPSRLDZrrk 5763
-VPSRLDZrrkz 5764
-VPSRLDri 5765
-VPSRLDrm 5766
-VPSRLDrr 5767
-VPSRLQYri 5768
-VPSRLQYrm 5769
-VPSRLQYrr 5770
-VPSRLQZ 5771
-VPSRLQZmbi 5772
-VPSRLQZmbik 5773
-VPSRLQZmbikz 5774
-VPSRLQZmi 5775
-VPSRLQZmik 5776
-VPSRLQZmikz 5777
-VPSRLQZri 5778
-VPSRLQZrik 5779
-VPSRLQZrikz 5780
-VPSRLQZrm 5781
-VPSRLQZrmk 5782
-VPSRLQZrmkz 5783
-VPSRLQZrr 5784
-VPSRLQZrrk 5785
-VPSRLQZrrkz 5786
-VPSRLQri 5787
-VPSRLQrm 5788
-VPSRLQrr 5789
-VPSRLVDYrm 5790
-VPSRLVDYrr 5791
-VPSRLVDZ 5792
-VPSRLVDZrm 5793
-VPSRLVDZrmb 5794
-VPSRLVDZrmbk 5795
-VPSRLVDZrmbkz 5796
-VPSRLVDZrmk 5797
-VPSRLVDZrmkz 5798
-VPSRLVDZrr 5799
-VPSRLVDZrrk 5800
-VPSRLVDZrrkz 5801
-VPSRLVDrm 5802
-VPSRLVDrr 5803
-VPSRLVQYrm 5804
-VPSRLVQYrr 5805
-VPSRLVQZ 5806
-VPSRLVQZrm 5807
-VPSRLVQZrmb 5808
-VPSRLVQZrmbk 5809
-VPSRLVQZrmbkz 5810
-VPSRLVQZrmk 5811
-VPSRLVQZrmkz 5812
-VPSRLVQZrr 5813
-VPSRLVQZrrk 5814
-VPSRLVQZrrkz 5815
-VPSRLVQrm 5816
-VPSRLVQrr 5817
-VPSRLVWZ 5818
-VPSRLVWZrm 5819
-VPSRLVWZrmk 5820
-VPSRLVWZrmkz 5821
-VPSRLVWZrr 5822
-VPSRLVWZrrk 5823
-VPSRLVWZrrkz 5824
-VPSRLWYri 5825
-VPSRLWYrm 5826
-VPSRLWYrr 5827
-VPSRLWZ 5828
-VPSRLWZmi 5829
-VPSRLWZmik 5830
-VPSRLWZmikz 5831
-VPSRLWZri 5832
-VPSRLWZrik 5833
-VPSRLWZrikz 5834
-VPSRLWZrm 5835
-VPSRLWZrmk 5836
-VPSRLWZrmkz 5837
-VPSRLWZrr 5838
-VPSRLWZrrk 5839
-VPSRLWZrrkz 5840
-VPSRLWri 5841
-VPSRLWrm 5842
-VPSRLWrr 5843
-VPSUBBYrm 5844
-VPSUBBYrr 5845
-VPSUBBZ 5846
-VPSUBBZrm 5847
-VPSUBBZrmk 5848
-VPSUBBZrmkz 5849
-VPSUBBZrr 5850
-VPSUBBZrrk 5851
-VPSUBBZrrkz 5852
-VPSUBBrm 5853
-VPSUBBrr 5854
-VPSUBDYrm 5855
-VPSUBDYrr 5856
-VPSUBDZ 5857
-VPSUBDZrm 5858
-VPSUBDZrmb 5859
-VPSUBDZrmbk 5860
-VPSUBDZrmbkz 5861
-VPSUBDZrmk 5862
-VPSUBDZrmkz 5863
-VPSUBDZrr 5864
-VPSUBDZrrk 5865
-VPSUBDZrrkz 5866
-VPSUBDrm 5867
-VPSUBDrr 5868
-VPSUBQYrm 5869
-VPSUBQYrr 5870
-VPSUBQZ 5871
-VPSUBQZrm 5872
-VPSUBQZrmb 5873
-VPSUBQZrmbk 5874
-VPSUBQZrmbkz 5875
-VPSUBQZrmk 5876
-VPSUBQZrmkz 5877
-VPSUBQZrr 5878
-VPSUBQZrrk 5879
-VPSUBQZrrkz 5880
-VPSUBQrm 5881
-VPSUBQrr 5882
-VPSUBSBYrm 5883
-VPSUBSBYrr 5884
-VPSUBSBZ 5885
-VPSUBSBZrm 5886
-VPSUBSBZrmk 5887
-VPSUBSBZrmkz 5888
-VPSUBSBZrr 5889
-VPSUBSBZrrk 5890
-VPSUBSBZrrkz 5891
-VPSUBSBrm 5892
-VPSUBSBrr 5893
-VPSUBSWYrm 5894
-VPSUBSWYrr 5895
-VPSUBSWZ 5896
-VPSUBSWZrm 5897
-VPSUBSWZrmk 5898
-VPSUBSWZrmkz 5899
-VPSUBSWZrr 5900
-VPSUBSWZrrk 5901
-VPSUBSWZrrkz 5902
-VPSUBSWrm 5903
-VPSUBSWrr 5904
-VPSUBUSBYrm 5905
-VPSUBUSBYrr 5906
-VPSUBUSBZ 5907
-VPSUBUSBZrm 5908
-VPSUBUSBZrmk 5909
-VPSUBUSBZrmkz 5910
-VPSUBUSBZrr 5911
-VPSUBUSBZrrk 5912
-VPSUBUSBZrrkz 5913
-VPSUBUSBrm 5914
-VPSUBUSBrr 5915
-VPSUBUSWYrm 5916
-VPSUBUSWYrr 5917
-VPSUBUSWZ 5918
-VPSUBUSWZrm 5919
-VPSUBUSWZrmk 5920
-VPSUBUSWZrmkz 5921
-VPSUBUSWZrr 5922
-VPSUBUSWZrrk 5923
-VPSUBUSWZrrkz 5924
-VPSUBUSWrm 5925
-VPSUBUSWrr 5926
-VPSUBWYrm 5927
-VPSUBWYrr 5928
-VPSUBWZ 5929
-VPSUBWZrm 5930
-VPSUBWZrmk 5931
-VPSUBWZrmkz 5932
-VPSUBWZrr 5933
-VPSUBWZrrk 5934
-VPSUBWZrrkz 5935
-VPSUBWrm 5936
-VPSUBWrr 5937
-VPTERNLOGDZ 5938
-VPTERNLOGDZrmbi 5939
-VPTERNLOGDZrmbik 5940
-VPTERNLOGDZrmbikz 5941
-VPTERNLOGDZrmi 5942
-VPTERNLOGDZrmik 5943
-VPTERNLOGDZrmikz 5944
-VPTERNLOGDZrri 5945
-VPTERNLOGDZrrik 5946
-VPTERNLOGDZrrikz 5947
-VPTERNLOGQZ 5948
-VPTERNLOGQZrmbi 5949
-VPTERNLOGQZrmbik 5950
-VPTERNLOGQZrmbikz 5951
-VPTERNLOGQZrmi 5952
-VPTERNLOGQZrmik 5953
-VPTERNLOGQZrmikz 5954
-VPTERNLOGQZrri 5955
-VPTERNLOGQZrrik 5956
-VPTERNLOGQZrrikz 5957
-VPTESTMBZ 5958
-VPTESTMBZrm 5959
-VPTESTMBZrmk 5960
-VPTESTMBZrr 5961
-VPTESTMBZrrk 5962
-VPTESTMDZ 5963
-VPTESTMDZrm 5964
-VPTESTMDZrmb 5965
-VPTESTMDZrmbk 5966
-VPTESTMDZrmk 5967
-VPTESTMDZrr 5968
-VPTESTMDZrrk 5969
-VPTESTMQZ 5970
-VPTESTMQZrm 5971
-VPTESTMQZrmb 5972
-VPTESTMQZrmbk 5973
-VPTESTMQZrmk 5974
-VPTESTMQZrr 5975
-VPTESTMQZrrk 5976
-VPTESTMWZ 5977
-VPTESTMWZrm 5978
-VPTESTMWZrmk 5979
-VPTESTMWZrr 5980
-VPTESTMWZrrk 5981
-VPTESTNMBZ 5982
-VPTESTNMBZrm 5983
-VPTESTNMBZrmk 5984
-VPTESTNMBZrr 5985
-VPTESTNMBZrrk 5986
-VPTESTNMDZ 5987
-VPTESTNMDZrm 5988
-VPTESTNMDZrmb 5989
-VPTESTNMDZrmbk 5990
-VPTESTNMDZrmk 5991
-VPTESTNMDZrr 5992
-VPTESTNMDZrrk 5993
-VPTESTNMQZ 5994
-VPTESTNMQZrm 5995
-VPTESTNMQZrmb 5996
-VPTESTNMQZrmbk 5997
-VPTESTNMQZrmk 5998
-VPTESTNMQZrr 5999
-VPTESTNMQZrrk 6000
-VPTESTNMWZ 6001
-VPTESTNMWZrm 6002
-VPTESTNMWZrmk 6003
-VPTESTNMWZrr 6004
-VPTESTNMWZrrk 6005
-VPTESTYrm 6006
-VPTESTYrr 6007
-VPTESTrm 6008
-VPTESTrr 6009
-VPUNPCKHBWYrm 6010
-VPUNPCKHBWYrr 6011
-VPUNPCKHBWZ 6012
-VPUNPCKHBWZrm 6013
-VPUNPCKHBWZrmk 6014
-VPUNPCKHBWZrmkz 6015
-VPUNPCKHBWZrr 6016
-VPUNPCKHBWZrrk 6017
-VPUNPCKHBWZrrkz 6018
-VPUNPCKHBWrm 6019
-VPUNPCKHBWrr 6020
-VPUNPCKHDQYrm 6021
-VPUNPCKHDQYrr 6022
-VPUNPCKHDQZ 6023
-VPUNPCKHDQZrm 6024
-VPUNPCKHDQZrmb 6025
-VPUNPCKHDQZrmbk 6026
-VPUNPCKHDQZrmbkz 6027
-VPUNPCKHDQZrmk 6028
-VPUNPCKHDQZrmkz 6029
-VPUNPCKHDQZrr 6030
-VPUNPCKHDQZrrk 6031
-VPUNPCKHDQZrrkz 6032
-VPUNPCKHDQrm 6033
-VPUNPCKHDQrr 6034
-VPUNPCKHQDQYrm 6035
-VPUNPCKHQDQYrr 6036
-VPUNPCKHQDQZ 6037
-VPUNPCKHQDQZrm 6038
-VPUNPCKHQDQZrmb 6039
-VPUNPCKHQDQZrmbk 6040
-VPUNPCKHQDQZrmbkz 6041
-VPUNPCKHQDQZrmk 6042
-VPUNPCKHQDQZrmkz 6043
-VPUNPCKHQDQZrr 6044
-VPUNPCKHQDQZrrk 6045
-VPUNPCKHQDQZrrkz 6046
-VPUNPCKHQDQrm 6047
-VPUNPCKHQDQrr 6048
-VPUNPCKHWDYrm 6049
-VPUNPCKHWDYrr 6050
-VPUNPCKHWDZ 6051
-VPUNPCKHWDZrm 6052
-VPUNPCKHWDZrmk 6053
-VPUNPCKHWDZrmkz 6054
-VPUNPCKHWDZrr 6055
-VPUNPCKHWDZrrk 6056
-VPUNPCKHWDZrrkz 6057
-VPUNPCKHWDrm 6058
-VPUNPCKHWDrr 6059
-VPUNPCKLBWYrm 6060
-VPUNPCKLBWYrr 6061
-VPUNPCKLBWZ 6062
-VPUNPCKLBWZrm 6063
-VPUNPCKLBWZrmk 6064
-VPUNPCKLBWZrmkz 6065
-VPUNPCKLBWZrr 6066
-VPUNPCKLBWZrrk 6067
-VPUNPCKLBWZrrkz 6068
-VPUNPCKLBWrm 6069
-VPUNPCKLBWrr 6070
-VPUNPCKLDQYrm 6071
-VPUNPCKLDQYrr 6072
-VPUNPCKLDQZ 6073
-VPUNPCKLDQZrm 6074
-VPUNPCKLDQZrmb 6075
-VPUNPCKLDQZrmbk 6076
-VPUNPCKLDQZrmbkz 6077
-VPUNPCKLDQZrmk 6078
-VPUNPCKLDQZrmkz 6079
-VPUNPCKLDQZrr 6080
-VPUNPCKLDQZrrk 6081
-VPUNPCKLDQZrrkz 6082
-VPUNPCKLDQrm 6083
-VPUNPCKLDQrr 6084
-VPUNPCKLQDQYrm 6085
-VPUNPCKLQDQYrr 6086
-VPUNPCKLQDQZ 6087
-VPUNPCKLQDQZrm 6088
-VPUNPCKLQDQZrmb 6089
-VPUNPCKLQDQZrmbk 6090
-VPUNPCKLQDQZrmbkz 6091
-VPUNPCKLQDQZrmk 6092
-VPUNPCKLQDQZrmkz 6093
-VPUNPCKLQDQZrr 6094
-VPUNPCKLQDQZrrk 6095
-VPUNPCKLQDQZrrkz 6096
-VPUNPCKLQDQrm 6097
-VPUNPCKLQDQrr 6098
-VPUNPCKLWDYrm 6099
-VPUNPCKLWDYrr 6100
-VPUNPCKLWDZ 6101
-VPUNPCKLWDZrm 6102
-VPUNPCKLWDZrmk 6103
-VPUNPCKLWDZrmkz 6104
-VPUNPCKLWDZrr 6105
-VPUNPCKLWDZrrk 6106
-VPUNPCKLWDZrrkz 6107
-VPUNPCKLWDrm 6108
-VPUNPCKLWDrr 6109
-VPXORDZ 6110
-VPXORDZrm 6111
-VPXORDZrmb 6112
-VPXORDZrmbk 6113
-VPXORDZrmbkz 6114
-VPXORDZrmk 6115
-VPXORDZrmkz 6116
-VPXORDZrr 6117
-VPXORDZrrk 6118
-VPXORDZrrkz 6119
-VPXORQZ 6120
-VPXORQZrm 6121
-VPXORQZrmb 6122
-VPXORQZrmbk 6123
-VPXORQZrmbkz 6124
-VPXORQZrmk 6125
-VPXORQZrmkz 6126
-VPXORQZrr 6127
-VPXORQZrrk 6128
-VPXORQZrrkz 6129
-VPXORYrm 6130
-VPXORYrr 6131
-VPXORrm 6132
-VPXORrr 6133
-VRANGEPDZ 6134
-VRANGEPDZrmbi 6135
-VRANGEPDZrmbik 6136
-VRANGEPDZrmbikz 6137
-VRANGEPDZrmi 6138
-VRANGEPDZrmik 6139
-VRANGEPDZrmikz 6140
-VRANGEPDZrri 6141
-VRANGEPDZrrib 6142
-VRANGEPDZrribk 6143
-VRANGEPDZrribkz 6144
-VRANGEPDZrrik 6145
-VRANGEPDZrrikz 6146
-VRANGEPSZ 6147
-VRANGEPSZrmbi 6148
-VRANGEPSZrmbik 6149
-VRANGEPSZrmbikz 6150
-VRANGEPSZrmi 6151
-VRANGEPSZrmik 6152
-VRANGEPSZrmikz 6153
-VRANGEPSZrri 6154
-VRANGEPSZrrib 6155
-VRANGEPSZrribk 6156
-VRANGEPSZrribkz 6157
-VRANGEPSZrrik 6158
-VRANGEPSZrrikz 6159
-VRANGESDZrmi 6160
-VRANGESDZrmik 6161
-VRANGESDZrmikz 6162
-VRANGESDZrri 6163
-VRANGESDZrrib 6164
-VRANGESDZrribk 6165
-VRANGESDZrribkz 6166
-VRANGESDZrrik 6167
-VRANGESDZrrikz 6168
-VRANGESSZrmi 6169
-VRANGESSZrmik 6170
-VRANGESSZrmikz 6171
-VRANGESSZrri 6172
-VRANGESSZrrib 6173
-VRANGESSZrribk 6174
-VRANGESSZrribkz 6175
-VRANGESSZrrik 6176
-VRANGESSZrrikz 6177
-VRCP 6178
-VRCPBF 6179
-VRCPPHZ 6180
-VRCPPHZm 6181
-VRCPPHZmb 6182
-VRCPPHZmbk 6183
-VRCPPHZmbkz 6184
-VRCPPHZmk 6185
-VRCPPHZmkz 6186
-VRCPPHZr 6187
-VRCPPHZrk 6188
-VRCPPHZrkz 6189
-VRCPPSYm 6190
-VRCPPSYr 6191
-VRCPPSm 6192
-VRCPPSr 6193
-VRCPSHZrm 6194
-VRCPSHZrmk 6195
-VRCPSHZrmkz 6196
-VRCPSHZrr 6197
-VRCPSHZrrk 6198
-VRCPSHZrrkz 6199
-VRCPSSm 6200
-VRCPSSm_Int 6201
-VRCPSSr 6202
-VRCPSSr_Int 6203
-VREDUCEBF 6204
-VREDUCEPDZ 6205
-VREDUCEPDZrmbi 6206
-VREDUCEPDZrmbik 6207
-VREDUCEPDZrmbikz 6208
-VREDUCEPDZrmi 6209
-VREDUCEPDZrmik 6210
-VREDUCEPDZrmikz 6211
-VREDUCEPDZrri 6212
-VREDUCEPDZrrib 6213
-VREDUCEPDZrribk 6214
-VREDUCEPDZrribkz 6215
-VREDUCEPDZrrik 6216
-VREDUCEPDZrrikz 6217
-VREDUCEPHZ 6218
-VREDUCEPHZrmbi 6219
-VREDUCEPHZrmbik 6220
-VREDUCEPHZrmbikz 6221
-VREDUCEPHZrmi 6222
-VREDUCEPHZrmik 6223
-VREDUCEPHZrmikz 6224
-VREDUCEPHZrri 6225
-VREDUCEPHZrrib 6226
-VREDUCEPHZrribk 6227
-VREDUCEPHZrribkz 6228
-VREDUCEPHZrrik 6229
-VREDUCEPHZrrikz 6230
-VREDUCEPSZ 6231
-VREDUCEPSZrmbi 6232
-VREDUCEPSZrmbik 6233
-VREDUCEPSZrmbikz 6234
-VREDUCEPSZrmi 6235
-VREDUCEPSZrmik 6236
-VREDUCEPSZrmikz 6237
-VREDUCEPSZrri 6238
-VREDUCEPSZrrib 6239
-VREDUCEPSZrribk 6240
-VREDUCEPSZrribkz 6241
-VREDUCEPSZrrik 6242
-VREDUCEPSZrrikz 6243
-VREDUCESDZrmi 6244
-VREDUCESDZrmik 6245
-VREDUCESDZrmikz 6246
-VREDUCESDZrri 6247
-VREDUCESDZrrib 6248
-VREDUCESDZrribk 6249
-VREDUCESDZrribkz 6250
-VREDUCESDZrrik 6251
-VREDUCESDZrrikz 6252
-VREDUCESHZrmi 6253
-VREDUCESHZrmik 6254
-VREDUCESHZrmikz 6255
-VREDUCESHZrri 6256
-VREDUCESHZrrib 6257
-VREDUCESHZrribk 6258
-VREDUCESHZrribkz 6259
-VREDUCESHZrrik 6260
-VREDUCESHZrrikz 6261
-VREDUCESSZrmi 6262
-VREDUCESSZrmik 6263
-VREDUCESSZrmikz 6264
-VREDUCESSZrri 6265
-VREDUCESSZrrib 6266
-VREDUCESSZrribk 6267
-VREDUCESSZrribkz 6268
-VREDUCESSZrrik 6269
-VREDUCESSZrrikz 6270
-VRNDSCALEBF 6271
-VRNDSCALEPDZ 6272
-VRNDSCALEPDZrmbi 6273
-VRNDSCALEPDZrmbik 6274
-VRNDSCALEPDZrmbikz 6275
-VRNDSCALEPDZrmi 6276
-VRNDSCALEPDZrmik 6277
-VRNDSCALEPDZrmikz 6278
-VRNDSCALEPDZrri 6279
-VRNDSCALEPDZrrib 6280
-VRNDSCALEPDZrribk 6281
-VRNDSCALEPDZrribkz 6282
-VRNDSCALEPDZrrik 6283
-VRNDSCALEPDZrrikz 6284
-VRNDSCALEPHZ 6285
-VRNDSCALEPHZrmbi 6286
-VRNDSCALEPHZrmbik 6287
-VRNDSCALEPHZrmbikz 6288
-VRNDSCALEPHZrmi 6289
-VRNDSCALEPHZrmik 6290
-VRNDSCALEPHZrmikz 6291
-VRNDSCALEPHZrri 6292
-VRNDSCALEPHZrrib 6293
-VRNDSCALEPHZrribk 6294
-VRNDSCALEPHZrribkz 6295
-VRNDSCALEPHZrrik 6296
-VRNDSCALEPHZrrikz 6297
-VRNDSCALEPSZ 6298
-VRNDSCALEPSZrmbi 6299
-VRNDSCALEPSZrmbik 6300
-VRNDSCALEPSZrmbikz 6301
-VRNDSCALEPSZrmi 6302
-VRNDSCALEPSZrmik 6303
-VRNDSCALEPSZrmikz 6304
-VRNDSCALEPSZrri 6305
-VRNDSCALEPSZrrib 6306
-VRNDSCALEPSZrribk 6307
-VRNDSCALEPSZrribkz 6308
-VRNDSCALEPSZrrik 6309
-VRNDSCALEPSZrrikz 6310
-VRNDSCALESDZrmi 6311
-VRNDSCALESDZrmi_Int 6312
-VRNDSCALESDZrmik_Int 6313
-VRNDSCALESDZrmikz_Int 6314
-VRNDSCALESDZrri 6315
-VRNDSCALESDZrri_Int 6316
-VRNDSCALESDZrrib_Int 6317
-VRNDSCALESDZrribk_Int 6318
-VRNDSCALESDZrribkz_Int 6319
-VRNDSCALESDZrrik_Int 6320
-VRNDSCALESDZrrikz_Int 6321
-VRNDSCALESHZrmi 6322
-VRNDSCALESHZrmi_Int 6323
-VRNDSCALESHZrmik_Int 6324
-VRNDSCALESHZrmikz_Int 6325
-VRNDSCALESHZrri 6326
-VRNDSCALESHZrri_Int 6327
-VRNDSCALESHZrrib_Int 6328
-VRNDSCALESHZrribk_Int 6329
-VRNDSCALESHZrribkz_Int 6330
-VRNDSCALESHZrrik_Int 6331
-VRNDSCALESHZrrikz_Int 6332
-VRNDSCALESSZrmi 6333
-VRNDSCALESSZrmi_Int 6334
-VRNDSCALESSZrmik_Int 6335
-VRNDSCALESSZrmikz_Int 6336
-VRNDSCALESSZrri 6337
-VRNDSCALESSZrri_Int 6338
-VRNDSCALESSZrrib_Int 6339
-VRNDSCALESSZrribk_Int 6340
-VRNDSCALESSZrribkz_Int 6341
-VRNDSCALESSZrrik_Int 6342
-VRNDSCALESSZrrikz_Int 6343
-VROUNDPDYmi 6344
-VROUNDPDYri 6345
-VROUNDPDmi 6346
-VROUNDPDri 6347
-VROUNDPSYmi 6348
-VROUNDPSYri 6349
-VROUNDPSmi 6350
-VROUNDPSri 6351
-VROUNDSDmi 6352
-VROUNDSDmi_Int 6353
-VROUNDSDri 6354
-VROUNDSDri_Int 6355
-VROUNDSSmi 6356
-VROUNDSSmi_Int 6357
-VROUNDSSri 6358
-VROUNDSSri_Int 6359
-VRSQRT 6360
-VRSQRTBF 6361
-VRSQRTPHZ 6362
-VRSQRTPHZm 6363
-VRSQRTPHZmb 6364
-VRSQRTPHZmbk 6365
-VRSQRTPHZmbkz 6366
-VRSQRTPHZmk 6367
-VRSQRTPHZmkz 6368
-VRSQRTPHZr 6369
-VRSQRTPHZrk 6370
-VRSQRTPHZrkz 6371
-VRSQRTPSYm 6372
-VRSQRTPSYr 6373
-VRSQRTPSm 6374
-VRSQRTPSr 6375
-VRSQRTSHZrm 6376
-VRSQRTSHZrmk 6377
-VRSQRTSHZrmkz 6378
-VRSQRTSHZrr 6379
-VRSQRTSHZrrk 6380
-VRSQRTSHZrrkz 6381
-VRSQRTSSm 6382
-VRSQRTSSm_Int 6383
-VRSQRTSSr 6384
-VRSQRTSSr_Int 6385
-VSCALEFBF 6386
-VSCALEFPDZ 6387
-VSCALEFPDZrm 6388
-VSCALEFPDZrmb 6389
-VSCALEFPDZrmbk 6390
-VSCALEFPDZrmbkz 6391
-VSCALEFPDZrmk 6392
-VSCALEFPDZrmkz 6393
-VSCALEFPDZrr 6394
-VSCALEFPDZrrb 6395
-VSCALEFPDZrrbk 6396
-VSCALEFPDZrrbkz 6397
-VSCALEFPDZrrk 6398
-VSCALEFPDZrrkz 6399
-VSCALEFPHZ 6400
-VSCALEFPHZrm 6401
-VSCALEFPHZrmb 6402
-VSCALEFPHZrmbk 6403
-VSCALEFPHZrmbkz 6404
-VSCALEFPHZrmk 6405
-VSCALEFPHZrmkz 6406
-VSCALEFPHZrr 6407
-VSCALEFPHZrrb 6408
-VSCALEFPHZrrbk 6409
-VSCALEFPHZrrbkz 6410
-VSCALEFPHZrrk 6411
-VSCALEFPHZrrkz 6412
-VSCALEFPSZ 6413
-VSCALEFPSZrm 6414
-VSCALEFPSZrmb 6415
-VSCALEFPSZrmbk 6416
-VSCALEFPSZrmbkz 6417
-VSCALEFPSZrmk 6418
-VSCALEFPSZrmkz 6419
-VSCALEFPSZrr 6420
-VSCALEFPSZrrb 6421
-VSCALEFPSZrrbk 6422
-VSCALEFPSZrrbkz 6423
-VSCALEFPSZrrk 6424
-VSCALEFPSZrrkz 6425
-VSCALEFSDZrm 6426
-VSCALEFSDZrmk 6427
-VSCALEFSDZrmkz 6428
-VSCALEFSDZrr 6429
-VSCALEFSDZrrb_Int 6430
-VSCALEFSDZrrbk_Int 6431
-VSCALEFSDZrrbkz_Int 6432
-VSCALEFSDZrrk 6433
-VSCALEFSDZrrkz 6434
-VSCALEFSHZrm 6435
-VSCALEFSHZrmk 6436
-VSCALEFSHZrmkz 6437
-VSCALEFSHZrr 6438
-VSCALEFSHZrrb_Int 6439
-VSCALEFSHZrrbk_Int 6440
-VSCALEFSHZrrbkz_Int 6441
-VSCALEFSHZrrk 6442
-VSCALEFSHZrrkz 6443
-VSCALEFSSZrm 6444
-VSCALEFSSZrmk 6445
-VSCALEFSSZrmkz 6446
-VSCALEFSSZrr 6447
-VSCALEFSSZrrb_Int 6448
-VSCALEFSSZrrbk_Int 6449
-VSCALEFSSZrrbkz_Int 6450
-VSCALEFSSZrrk 6451
-VSCALEFSSZrrkz 6452
-VSCATTERDPDZ 6453
-VSCATTERDPDZmr 6454
-VSCATTERDPSZ 6455
-VSCATTERDPSZmr 6456
-VSCATTERPF 6457
-VSCATTERQPDZ 6458
-VSCATTERQPDZmr 6459
-VSCATTERQPSZ 6460
-VSCATTERQPSZmr 6461
-VSHA 6462
-VSHUFF 6463
-VSHUFI 6464
-VSHUFPDYrmi 6465
-VSHUFPDYrri 6466
-VSHUFPDZ 6467
-VSHUFPDZrmbi 6468
-VSHUFPDZrmbik 6469
-VSHUFPDZrmbikz 6470
-VSHUFPDZrmi 6471
-VSHUFPDZrmik 6472
-VSHUFPDZrmikz 6473
-VSHUFPDZrri 6474
-VSHUFPDZrrik 6475
-VSHUFPDZrrikz 6476
-VSHUFPDrmi 6477
-VSHUFPDrri 6478
-VSHUFPSYrmi 6479
-VSHUFPSYrri 6480
-VSHUFPSZ 6481
-VSHUFPSZrmbi 6482
-VSHUFPSZrmbik 6483
-VSHUFPSZrmbikz 6484
-VSHUFPSZrmi 6485
-VSHUFPSZrmik 6486
-VSHUFPSZrmikz 6487
-VSHUFPSZrri 6488
-VSHUFPSZrrik 6489
-VSHUFPSZrrikz 6490
-VSHUFPSrmi 6491
-VSHUFPSrri 6492
-VSM 6493
-VSQRTBF 6494
-VSQRTPDYm 6495
-VSQRTPDYr 6496
-VSQRTPDZ 6497
-VSQRTPDZm 6498
-VSQRTPDZmb 6499
-VSQRTPDZmbk 6500
-VSQRTPDZmbkz 6501
-VSQRTPDZmk 6502
-VSQRTPDZmkz 6503
-VSQRTPDZr 6504
-VSQRTPDZrb 6505
-VSQRTPDZrbk 6506
-VSQRTPDZrbkz 6507
-VSQRTPDZrk 6508
-VSQRTPDZrkz 6509
-VSQRTPDm 6510
-VSQRTPDr 6511
-VSQRTPHZ 6512
-VSQRTPHZm 6513
-VSQRTPHZmb 6514
-VSQRTPHZmbk 6515
-VSQRTPHZmbkz 6516
-VSQRTPHZmk 6517
-VSQRTPHZmkz 6518
-VSQRTPHZr 6519
-VSQRTPHZrb 6520
-VSQRTPHZrbk 6521
-VSQRTPHZrbkz 6522
-VSQRTPHZrk 6523
-VSQRTPHZrkz 6524
-VSQRTPSYm 6525
-VSQRTPSYr 6526
-VSQRTPSZ 6527
-VSQRTPSZm 6528
-VSQRTPSZmb 6529
-VSQRTPSZmbk 6530
-VSQRTPSZmbkz 6531
-VSQRTPSZmk 6532
-VSQRTPSZmkz 6533
-VSQRTPSZr 6534
-VSQRTPSZrb 6535
-VSQRTPSZrbk 6536
-VSQRTPSZrbkz 6537
-VSQRTPSZrk 6538
-VSQRTPSZrkz 6539
-VSQRTPSm 6540
-VSQRTPSr 6541
-VSQRTSDZm 6542
-VSQRTSDZm_Int 6543
-VSQRTSDZmk_Int 6544
-VSQRTSDZmkz_Int 6545
-VSQRTSDZr 6546
-VSQRTSDZr_Int 6547
-VSQRTSDZrb_Int 6548
-VSQRTSDZrbk_Int 6549
-VSQRTSDZrbkz_Int 6550
-VSQRTSDZrk_Int 6551
-VSQRTSDZrkz_Int 6552
-VSQRTSDm 6553
-VSQRTSDm_Int 6554
-VSQRTSDr 6555
-VSQRTSDr_Int 6556
-VSQRTSHZm 6557
-VSQRTSHZm_Int 6558
-VSQRTSHZmk_Int 6559
-VSQRTSHZmkz_Int 6560
-VSQRTSHZr 6561
-VSQRTSHZr_Int 6562
-VSQRTSHZrb_Int 6563
-VSQRTSHZrbk_Int 6564
-VSQRTSHZrbkz_Int 6565
-VSQRTSHZrk_Int 6566
-VSQRTSHZrkz_Int 6567
-VSQRTSSZm 6568
-VSQRTSSZm_Int 6569
-VSQRTSSZmk_Int 6570
-VSQRTSSZmkz_Int 6571
-VSQRTSSZr 6572
-VSQRTSSZr_Int 6573
-VSQRTSSZrb_Int 6574
-VSQRTSSZrbk_Int 6575
-VSQRTSSZrbkz_Int 6576
-VSQRTSSZrk_Int 6577
-VSQRTSSZrkz_Int 6578
-VSQRTSSm 6579
-VSQRTSSm_Int 6580
-VSQRTSSr 6581
-VSQRTSSr_Int 6582
-VSTMXCSR 6583
-VSUBBF 6584
-VSUBPDYrm 6585
-VSUBPDYrr 6586
-VSUBPDZ 6587
-VSUBPDZrm 6588
-VSUBPDZrmb 6589
-VSUBPDZrmbk 6590
-VSUBPDZrmbkz 6591
-VSUBPDZrmk 6592
-VSUBPDZrmkz 6593
-VSUBPDZrr 6594
-VSUBPDZrrb 6595
-VSUBPDZrrbk 6596
-VSUBPDZrrbkz 6597
-VSUBPDZrrk 6598
-VSUBPDZrrkz 6599
-VSUBPDrm 6600
-VSUBPDrr 6601
-VSUBPHZ 6602
-VSUBPHZrm 6603
-VSUBPHZrmb 6604
-VSUBPHZrmbk 6605
-VSUBPHZrmbkz 6606
-VSUBPHZrmk 6607
-VSUBPHZrmkz 6608
-VSUBPHZrr 6609
-VSUBPHZrrb 6610
-VSUBPHZrrbk 6611
-VSUBPHZrrbkz 6612
-VSUBPHZrrk 6613
-VSUBPHZrrkz 6614
-VSUBPSYrm 6615
-VSUBPSYrr 6616
-VSUBPSZ 6617
-VSUBPSZrm 6618
-VSUBPSZrmb 6619
-VSUBPSZrmbk 6620
-VSUBPSZrmbkz 6621
-VSUBPSZrmk 6622
-VSUBPSZrmkz 6623
-VSUBPSZrr 6624
-VSUBPSZrrb 6625
-VSUBPSZrrbk 6626
-VSUBPSZrrbkz 6627
-VSUBPSZrrk 6628
-VSUBPSZrrkz 6629
-VSUBPSrm 6630
-VSUBPSrr 6631
-VSUBSDZrm 6632
-VSUBSDZrm_Int 6633
-VSUBSDZrmk_Int 6634
-VSUBSDZrmkz_Int 6635
-VSUBSDZrr 6636
-VSUBSDZrr_Int 6637
-VSUBSDZrrb_Int 6638
-VSUBSDZrrbk_Int 6639
-VSUBSDZrrbkz_Int 6640
-VSUBSDZrrk_Int 6641
-VSUBSDZrrkz_Int 6642
-VSUBSDrm 6643
-VSUBSDrm_Int 6644
-VSUBSDrr 6645
-VSUBSDrr_Int 6646
-VSUBSHZrm 6647
-VSUBSHZrm_Int 6648
-VSUBSHZrmk_Int 6649
-VSUBSHZrmkz_Int 6650
-VSUBSHZrr 6651
-VSUBSHZrr_Int 6652
-VSUBSHZrrb_Int 6653
-VSUBSHZrrbk_Int 6654
-VSUBSHZrrbkz_Int 6655
-VSUBSHZrrk_Int 6656
-VSUBSHZrrkz_Int 6657
-VSUBSSZrm 6658
-VSUBSSZrm_Int 6659
-VSUBSSZrmk_Int 6660
-VSUBSSZrmkz_Int 6661
-VSUBSSZrr 6662
-VSUBSSZrr_Int 6663
-VSUBSSZrrb_Int 6664
-VSUBSSZrrbk_Int 6665
-VSUBSSZrrbkz_Int 6666
-VSUBSSZrrk_Int 6667
-VSUBSSZrrkz_Int 6668
-VSUBSSrm 6669
-VSUBSSrm_Int 6670
-VSUBSSrr 6671
-VSUBSSrr_Int 6672
-VTESTPDYrm 6673
-VTESTPDYrr 6674
-VTESTPDrm 6675
-VTESTPDrr 6676
-VTESTPSYrm 6677
-VTESTPSYrr 6678
-VTESTPSrm 6679
-VTESTPSrr 6680
-VUCOMISDZrm 6681
-VUCOMISDZrm_Int 6682
-VUCOMISDZrr 6683
-VUCOMISDZrr_Int 6684
-VUCOMISDZrrb 6685
-VUCOMISDrm 6686
-VUCOMISDrm_Int 6687
-VUCOMISDrr 6688
-VUCOMISDrr_Int 6689
-VUCOMISHZrm 6690
-VUCOMISHZrm_Int 6691
-VUCOMISHZrr 6692
-VUCOMISHZrr_Int 6693
-VUCOMISHZrrb 6694
-VUCOMISSZrm 6695
-VUCOMISSZrm_Int 6696
-VUCOMISSZrr 6697
-VUCOMISSZrr_Int 6698
-VUCOMISSZrrb 6699
-VUCOMISSrm 6700
-VUCOMISSrm_Int 6701
-VUCOMISSrr 6702
-VUCOMISSrr_Int 6703
-VUCOMXSDZrm 6704
-VUCOMXSDZrm_Int 6705
-VUCOMXSDZrr 6706
-VUCOMXSDZrr_Int 6707
-VUCOMXSDZrrb_Int 6708
-VUCOMXSHZrm 6709
-VUCOMXSHZrm_Int 6710
-VUCOMXSHZrr 6711
-VUCOMXSHZrr_Int 6712
-VUCOMXSHZrrb_Int 6713
-VUCOMXSSZrm 6714
-VUCOMXSSZrm_Int 6715
-VUCOMXSSZrr 6716
-VUCOMXSSZrr_Int 6717
-VUCOMXSSZrrb_Int 6718
-VUNPCKHPDYrm 6719
-VUNPCKHPDYrr 6720
-VUNPCKHPDZ 6721
-VUNPCKHPDZrm 6722
-VUNPCKHPDZrmb 6723
-VUNPCKHPDZrmbk 6724
-VUNPCKHPDZrmbkz 6725
-VUNPCKHPDZrmk 6726
-VUNPCKHPDZrmkz 6727
-VUNPCKHPDZrr 6728
-VUNPCKHPDZrrk 6729
-VUNPCKHPDZrrkz 6730
-VUNPCKHPDrm 6731
-VUNPCKHPDrr 6732
-VUNPCKHPSYrm 6733
-VUNPCKHPSYrr 6734
-VUNPCKHPSZ 6735
-VUNPCKHPSZrm 6736
-VUNPCKHPSZrmb 6737
-VUNPCKHPSZrmbk 6738
-VUNPCKHPSZrmbkz 6739
-VUNPCKHPSZrmk 6740
-VUNPCKHPSZrmkz 6741
-VUNPCKHPSZrr 6742
-VUNPCKHPSZrrk 6743
-VUNPCKHPSZrrkz 6744
-VUNPCKHPSrm 6745
-VUNPCKHPSrr 6746
-VUNPCKLPDYrm 6747
-VUNPCKLPDYrr 6748
-VUNPCKLPDZ 6749
-VUNPCKLPDZrm 6750
-VUNPCKLPDZrmb 6751
-VUNPCKLPDZrmbk 6752
-VUNPCKLPDZrmbkz 6753
-VUNPCKLPDZrmk 6754
-VUNPCKLPDZrmkz 6755
-VUNPCKLPDZrr 6756
-VUNPCKLPDZrrk 6757
-VUNPCKLPDZrrkz 6758
-VUNPCKLPDrm 6759
-VUNPCKLPDrr 6760
-VUNPCKLPSYrm 6761
-VUNPCKLPSYrr 6762
-VUNPCKLPSZ 6763
-VUNPCKLPSZrm 6764
-VUNPCKLPSZrmb 6765
-VUNPCKLPSZrmbk 6766
-VUNPCKLPSZrmbkz 6767
-VUNPCKLPSZrmk 6768
-VUNPCKLPSZrmkz 6769
-VUNPCKLPSZrr 6770
-VUNPCKLPSZrrk 6771
-VUNPCKLPSZrrkz 6772
-VUNPCKLPSrm 6773
-VUNPCKLPSrr 6774
-VXORPDYrm 6775
-VXORPDYrr 6776
-VXORPDZ 6777
-VXORPDZrm 6778
-VXORPDZrmb 6779
-VXORPDZrmbk 6780
-VXORPDZrmbkz 6781
-VXORPDZrmk 6782
-VXORPDZrmkz 6783
-VXORPDZrr 6784
-VXORPDZrrk 6785
-VXORPDZrrkz 6786
-VXORPDrm 6787
-VXORPDrr 6788
-VXORPSYrm 6789
-VXORPSYrr 6790
-VXORPSZ 6791
-VXORPSZrm 6792
-VXORPSZrmb 6793
-VXORPSZrmbk 6794
-VXORPSZrmbkz 6795
-VXORPSZrmk 6796
-VXORPSZrmkz 6797
-VXORPSZrr 6798
-VXORPSZrrk 6799
-VXORPSZrrkz 6800
-VXORPSrm 6801
-VXORPSrr 6802
-VZEROALL 6803
-VZEROUPPER 6804
-V_SET 6805
-V_SETALLONES 6806
-WAIT 6807
-WBINVD 6808
-WBNOINVD 6809
-WRFLAGS 6810
-WRFSBASE 6811
-WRGSBASE 6812
-WRMSR 6813
-WRMSRLIST 6814
-WRMSRNS 6815
-WRMSRNSir 6816
-WRMSRNSir_EVEX 6817
-WRPKRUr 6818
-WRSSD 6819
-WRSSD_EVEX 6820
-WRSSQ 6821
-WRSSQ_EVEX 6822
-WRUSSD 6823
-WRUSSD_EVEX 6824
-WRUSSQ 6825
-WRUSSQ_EVEX 6826
-XABORT 6827
-XABORT_DEF 6828
-XACQUIRE_PREFIX 6829
-XADD 6830
-XAM_F 6831
-XAM_Fp 6832
-XBEGIN 6833
-XCHG 6834
-XCH_F 6835
-XCRYPTCBC 6836
-XCRYPTCFB 6837
-XCRYPTCTR 6838
-XCRYPTECB 6839
-XCRYPTOFB 6840
-XEND 6841
-XGETBV 6842
-XLAT 6843
-XOR 6844
-XORPDrm 6845
-XORPDrr 6846
-XORPSrm 6847
-XORPSrr 6848
-XRELEASE_PREFIX 6849
-XRESLDTRK 6850
-XRSTOR 6851
-XRSTORS 6852
-XSAVE 6853
-XSAVEC 6854
-XSAVEOPT 6855
-XSAVES 6856
-XSETBV 6857
-XSHA 6858
-XSTORE 6859
-XSUSLDTRK 6860
-XTEST 6861
-Immediate 6862
-CImmediate 6863
-FPImmediate 6864
-MBB 6865
-FrameIndex 6866
-ConstantPoolIndex 6867
-TargetIndex 6868
-JumpTableIndex 6869
-ExternalSymbol 6870
-GlobalAddress 6871
-BlockAddress 6872
-RegisterMask 6873
-RegisterLiveOut 6874
-Metadata 6875
-MCSymbol 6876
-CFIIndex 6877
-IntrinsicID 6878
-Predicate 6879
-ShuffleMask 6880
-PhyReg_GR8 6881
-PhyReg_GRH8 6882
-PhyReg_GR8_NOREX2 6883
-PhyReg_GR8_NOREX 6884
-PhyReg_GR8_ABCD_H 6885
-PhyReg_GR8_ABCD_L 6886
-PhyReg_GRH16 6887
-PhyReg_GR16 6888
-PhyReg_GR16_NOREX2 6889
-PhyReg_GR16_NOREX 6890
-PhyReg_VK1 6891
-PhyReg_VK16 6892
-PhyReg_VK2 6893
-PhyReg_VK4 6894
-PhyReg_VK8 6895
-PhyReg_VK16WM 6896
-PhyReg_VK1WM 6897
-PhyReg_VK2WM 6898
-PhyReg_VK4WM 6899
-PhyReg_VK8WM 6900
-PhyReg_SEGMENT_REG 6901
-PhyReg_GR16_ABCD 6902
-PhyReg_FPCCR 6903
-PhyReg_FR16X 6904
-PhyReg_FR16 6905
-PhyReg_VK16PAIR 6906
-PhyReg_VK1PAIR 6907
-PhyReg_VK2PAIR 6908
-PhyReg_VK4PAIR 6909
-PhyReg_VK8PAIR 6910
-PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM 6911
-PhyReg_LOW32_ADDR_ACCESS_RBP 6912
-PhyReg_LOW32_ADDR_ACCESS 6913
-PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 6914
-PhyReg_FR32X 6915
-PhyReg_GR32 6916
-PhyReg_GR32_NOSP 6917
-PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 6918
-PhyReg_DEBUG_REG 6919
-PhyReg_FR32 6920
-PhyReg_GR32_NOREX2 6921
-PhyReg_GR32_NOREX2_NOSP 6922
-PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 6923
-PhyReg_GR32_NOREX 6924
-PhyReg_VK32 6925
-PhyReg_GR32_NOREX_NOSP 6926
-PhyReg_RFP32 6927
-PhyReg_VK32WM 6928
-PhyReg_GR32_ABCD 6929
-PhyReg_GR32_TC 6930
-PhyReg_GR32_ABCD_and_GR32_TC 6931
-PhyReg_GR32_AD 6932
-PhyReg_GR32_ArgRef 6933
-PhyReg_GR32_BPSP 6934
-PhyReg_GR32_BSI 6935
-PhyReg_GR32_CB 6936
-PhyReg_GR32_DC 6937
-PhyReg_GR32_DIBP 6938
-PhyReg_GR32_SIDI 6939
-PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 6940
-PhyReg_CCR 6941
-PhyReg_DFCCR 6942
-PhyReg_GR32_ABCD_and_GR32_BSI 6943
-PhyReg_GR32_AD_and_GR32_ArgRef 6944
-PhyReg_GR32_ArgRef_and_GR32_CB 6945
-PhyReg_GR32_BPSP_and_GR32_DIBP 6946
-PhyReg_GR32_BPSP_and_GR32_TC 6947
-PhyReg_GR32_BSI_and_GR32_SIDI 6948
-PhyReg_GR32_DIBP_and_GR32_SIDI 6949
-PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 6950
-PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit 6951
-PhyReg_RFP64 6952
-PhyReg_GR64 6953
-PhyReg_FR64X 6954
-PhyReg_GR64_with_sub_8bit 6955
-PhyReg_GR64_NOSP 6956
-PhyReg_GR64_NOREX2 6957
-PhyReg_CONTROL_REG 6958
-PhyReg_FR64 6959
-PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2 6960
-PhyReg_GR64_NOREX2_NOSP 6961
-PhyReg_GR64PLTSafe 6962
-PhyReg_GR64_TC 6963
-PhyReg_GR64_NOREX 6964
-PhyReg_GR64_TCW64 6965
-PhyReg_GR64_TC_with_sub_8bit 6966
-PhyReg_GR64_NOREX2_NOSP_and_GR64_TC 6967
-PhyReg_GR64_TCW64_with_sub_8bit 6968
-PhyReg_GR64_TC_and_GR64_TCW64 6969
-PhyReg_GR64_with_sub_16bit_in_GR16_NOREX 6970
-PhyReg_VK64 6971
-PhyReg_VR64 6972
-PhyReg_GR64PLTSafe_and_GR64_TC 6973
-PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64 6974
-PhyReg_GR64_NOREX_NOSP 6975
-PhyReg_GR64_NOREX_and_GR64_TC 6976
-PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 6977
-PhyReg_VK64WM 6978
-PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 6979
-PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 6980
-PhyReg_GR64PLTSafe_and_GR64_TCW64 6981
-PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 6982
-PhyReg_GR64_NOREX_and_GR64_TCW64 6983
-PhyReg_GR64_ABCD 6984
-PhyReg_GR64_with_sub_32bit_in_GR32_TC 6985
-PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 6986
-PhyReg_GR64_AD 6987
-PhyReg_GR64_ArgRef 6988
-PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP 6989
-PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef 6990
-PhyReg_GR64_with_sub_32bit_in_GR32_BPSP 6991
-PhyReg_GR64_with_sub_32bit_in_GR32_BSI 6992
-PhyReg_GR64_with_sub_32bit_in_GR32_CB 6993
-PhyReg_GR64_with_sub_32bit_in_GR32_DIBP 6994
-PhyReg_GR64_with_sub_32bit_in_GR32_SIDI 6995
-PhyReg_GR64_A 6996
-PhyReg_GR64_ArgRef_and_GR64_TC 6997
-PhyReg_GR64_and_LOW32_ADDR_ACCESS 6998
-PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 6999
-PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7000
-PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7001
-PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7002
-PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7003
-PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7004
-PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7005
-PhyReg_RST 7006
-PhyReg_RFP80 7007
-PhyReg_RFP80_7 7008
-PhyReg_VR128X 7009
-PhyReg_VR128 7010
-PhyReg_VR256X 7011
-PhyReg_VR256 7012
-PhyReg_VR512 7013
-PhyReg_VR512_0_15 7014
-PhyReg_TILE 7015
-VirtReg_GR8 7016
-VirtReg_GRH8 7017
-VirtReg_GR8_NOREX2 7018
-VirtReg_GR8_NOREX 7019
-VirtReg_GR8_ABCD_H 7020
-VirtReg_GR8_ABCD_L 7021
-VirtReg_GRH16 7022
-VirtReg_GR16 7023
-VirtReg_GR16_NOREX2 7024
-VirtReg_GR16_NOREX 7025
-VirtReg_VK1 7026
-VirtReg_VK16 7027
-VirtReg_VK2 7028
-VirtReg_VK4 7029
-VirtReg_VK8 7030
-VirtReg_VK16WM 7031
-VirtReg_VK1WM 7032
-VirtReg_VK2WM 7033
-VirtReg_VK4WM 7034
-VirtReg_VK8WM 7035
-VirtReg_SEGMENT_REG 7036
-VirtReg_GR16_ABCD 7037
-VirtReg_FPCCR 7038
-VirtReg_FR16X 7039
-VirtReg_FR16 7040
-VirtReg_VK16PAIR 7041
-VirtReg_VK1PAIR 7042
-VirtReg_VK2PAIR 7043
-VirtReg_VK4PAIR 7044
-VirtReg_VK8PAIR 7045
-VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM 7046
-VirtReg_LOW32_ADDR_ACCESS_RBP 7047
-VirtReg_LOW32_ADDR_ACCESS 7048
-VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 7049
-VirtReg_FR32X 7050
-VirtReg_GR32 7051
-VirtReg_GR32_NOSP 7052
-VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 7053
-VirtReg_DEBUG_REG 7054
-VirtReg_FR32 7055
-VirtReg_GR32_NOREX2 7056
-VirtReg_GR32_NOREX2_NOSP 7057
-VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 7058
-VirtReg_GR32_NOREX 7059
-VirtReg_VK32 7060
-VirtReg_GR32_NOREX_NOSP 7061
-VirtReg_RFP32 7062
-VirtReg_VK32WM 7063
-VirtReg_GR32_ABCD 7064
-VirtReg_GR32_TC 7065
-VirtReg_GR32_ABCD_and_GR32_TC 7066
-VirtReg_GR32_AD 7067
-VirtReg_GR32_ArgRef 7068
-VirtReg_GR32_BPSP 7069
-VirtReg_GR32_BSI 7070
-VirtReg_GR32_CB 7071
-VirtReg_GR32_DC 7072
-VirtReg_GR32_DIBP 7073
-VirtReg_GR32_SIDI 7074
-VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 7075
-VirtReg_CCR 7076
-VirtReg_DFCCR 7077
-VirtReg_GR32_ABCD_and_GR32_BSI 7078
-VirtReg_GR32_AD_and_GR32_ArgRef 7079
-VirtReg_GR32_ArgRef_and_GR32_CB 7080
-VirtReg_GR32_BPSP_and_GR32_DIBP 7081
-VirtReg_GR32_BPSP_and_GR32_TC 7082
-VirtReg_GR32_BSI_and_GR32_SIDI 7083
-VirtReg_GR32_DIBP_and_GR32_SIDI 7084
-VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 7085
-VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit 7086
-VirtReg_RFP64 7087
-VirtReg_GR64 7088
-VirtReg_FR64X 7089
-VirtReg_GR64_with_sub_8bit 7090
-VirtReg_GR64_NOSP 7091
-VirtReg_GR64_NOREX2 7092
-VirtReg_CONTROL_REG 7093
-VirtReg_FR64 7094
-VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2 7095
-VirtReg_GR64_NOREX2_NOSP 7096
-VirtReg_GR64PLTSafe 7097
-VirtReg_GR64_TC 7098
-VirtReg_GR64_NOREX 7099
-VirtReg_GR64_TCW64 7100
-VirtReg_GR64_TC_with_sub_8bit 7101
-VirtReg_GR64_NOREX2_NOSP_and_GR64_TC 7102
-VirtReg_GR64_TCW64_with_sub_8bit 7103
-VirtReg_GR64_TC_and_GR64_TCW64 7104
-VirtReg_GR64_with_sub_16bit_in_GR16_NOREX 7105
-VirtReg_VK64 7106
-VirtReg_VR64 7107
-VirtReg_GR64PLTSafe_and_GR64_TC 7108
-VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64 7109
-VirtReg_GR64_NOREX_NOSP 7110
-VirtReg_GR64_NOREX_and_GR64_TC 7111
-VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 7112
-VirtReg_VK64WM 7113
-VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 7114
-VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7115
-VirtReg_GR64PLTSafe_and_GR64_TCW64 7116
-VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7117
-VirtReg_GR64_NOREX_and_GR64_TCW64 7118
-VirtReg_GR64_ABCD 7119
-VirtReg_GR64_with_sub_32bit_in_GR32_TC 7120
-VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7121
-VirtReg_GR64_AD 7122
-VirtReg_GR64_ArgRef 7123
-VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7124
-VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef 7125
-VirtReg_GR64_with_sub_32bit_in_GR32_BPSP 7126
-VirtReg_GR64_with_sub_32bit_in_GR32_BSI 7127
-VirtReg_GR64_with_sub_32bit_in_GR32_CB 7128
-VirtReg_GR64_with_sub_32bit_in_GR32_DIBP 7129
-VirtReg_GR64_with_sub_32bit_in_GR32_SIDI 7130
-VirtReg_GR64_A 7131
-VirtReg_GR64_ArgRef_and_GR64_TC 7132
-VirtReg_GR64_and_LOW32_ADDR_ACCESS 7133
-VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7134
-VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7135
-VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7136
-VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7137
-VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7138
-VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7139
-VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7140
-VirtReg_RST 7141
-VirtReg_RFP80 7142
-VirtReg_RFP80_7 7143
-VirtReg_VR128X 7144
-VirtReg_VR128 7145
-VirtReg_VR256X 7146
-VirtReg_VR256 7147
-VirtReg_VR512 7148
-VirtReg_VR512_0_15 7149
-VirtReg_TILE 7150
+RELOC_NONE 1533
+REPNE_PREFIX 1534
+REP_MOVSB 1535
+REP_MOVSD 1536
+REP_MOVSQ 1537
+REP_MOVSW 1538
+REP_PREFIX 1539
+REP_STOSB 1540
+REP_STOSD 1541
+REP_STOSQ 1542
+REP_STOSW 1543
+RET 1544
+RETI 1545
+REX 1546
+RMPADJUST 1547
+RMPQUERY 1548
+RMPUPDATE 1549
+ROL 1550
+ROR 1551
+RORX 1552
+ROUNDPDmi 1553
+ROUNDPDri 1554
+ROUNDPSmi 1555
+ROUNDPSri 1556
+ROUNDSDmi 1557
+ROUNDSDmi_Int 1558
+ROUNDSDri 1559
+ROUNDSDri_Int 1560
+ROUNDSSmi 1561
+ROUNDSSmi_Int 1562
+ROUNDSSri 1563
+ROUNDSSri_Int 1564
+RSM 1565
+RSQRTPSm 1566
+RSQRTPSr 1567
+RSQRTSSm 1568
+RSQRTSSm_Int 1569
+RSQRTSSr 1570
+RSQRTSSr_Int 1571
+RSTORSSP 1572
+SAHF 1573
+SALC 1574
+SAR 1575
+SARX 1576
+SAVEPREVSSP 1577
+SBB 1578
+SCASB 1579
+SCASL 1580
+SCASQ 1581
+SCASW 1582
+SEAMCALL 1583
+SEAMOPS 1584
+SEAMRET 1585
+SEG_ALLOCA 1586
+SEH_BeginEpilogue 1587
+SEH_EndEpilogue 1588
+SEH_EndPrologue 1589
+SEH_PushFrame 1590
+SEH_PushReg 1591
+SEH_SaveReg 1592
+SEH_SaveXMM 1593
+SEH_SetFrame 1594
+SEH_StackAlign 1595
+SEH_StackAlloc 1596
+SEH_UnwindV 1597
+SEH_UnwindVersion 1598
+SENDUIPI 1599
+SERIALIZE 1600
+SETB_C 1601
+SETCCm 1602
+SETCCm_EVEX 1603
+SETCCr 1604
+SETCCr_EVEX 1605
+SETSSBSY 1606
+SETZUCCm 1607
+SETZUCCr 1608
+SFENCE 1609
+SGDT 1610
+SHA 1611
+SHL 1612
+SHLD 1613
+SHLDROT 1614
+SHLX 1615
+SHR 1616
+SHRD 1617
+SHRDROT 1618
+SHRX 1619
+SHUFPDrmi 1620
+SHUFPDrri 1621
+SHUFPSrmi 1622
+SHUFPSrri 1623
+SIDT 1624
+SKINIT 1625
+SLDT 1626
+SLWPCB 1627
+SMSW 1628
+SQRTPDm 1629
+SQRTPDr 1630
+SQRTPSm 1631
+SQRTPSr 1632
+SQRTSDm 1633
+SQRTSDm_Int 1634
+SQRTSDr 1635
+SQRTSDr_Int 1636
+SQRTSSm 1637
+SQRTSSm_Int 1638
+SQRTSSr 1639
+SQRTSSr_Int 1640
+SQRT_F 1641
+SQRT_Fp 1642
+SS_PREFIX 1643
+STAC 1644
+STACKALLOC_W_PROBING 1645
+STACKMAP 1646
+STATEPOINT 1647
+STC 1648
+STD 1649
+STGI 1650
+STI 1651
+STMXCSR 1652
+STOSB 1653
+STOSL 1654
+STOSQ 1655
+STOSW 1656
+STR 1657
+STRm 1658
+STTILECFG 1659
+STTILECFG_EVEX 1660
+STUI 1661
+ST_F 1662
+ST_FP 1663
+ST_FPrr 1664
+ST_Fp 1665
+ST_FpP 1666
+ST_Frr 1667
+SUB 1668
+SUBPDrm 1669
+SUBPDrr 1670
+SUBPSrm 1671
+SUBPSrr 1672
+SUBREG_TO_REG 1673
+SUBR_F 1674
+SUBR_FI 1675
+SUBR_FPrST 1676
+SUBR_FST 1677
+SUBR_Fp 1678
+SUBR_FpI 1679
+SUBR_FrST 1680
+SUBSDrm 1681
+SUBSDrm_Int 1682
+SUBSDrr 1683
+SUBSDrr_Int 1684
+SUBSSrm 1685
+SUBSSrm_Int 1686
+SUBSSrr 1687
+SUBSSrr_Int 1688
+SUB_F 1689
+SUB_FI 1690
+SUB_FPrST 1691
+SUB_FST 1692
+SUB_Fp 1693
+SUB_FpI 1694
+SUB_FrST 1695
+SWAPGS 1696
+SYSCALL 1697
+SYSENTER 1698
+SYSEXIT 1699
+SYSRET 1700
+T 1701
+TAILJMPd 1702
+TAILJMPd_CC 1703
+TAILJMPm 1704
+TAILJMPr 1705
+TCMMIMFP 1706
+TCMMRLFP 1707
+TCRETURN_HIPE 1708
+TCRETURN_WIN 1709
+TCRETURN_WINmi 1710
+TCRETURNdi 1711
+TCRETURNdicc 1712
+TCRETURNmi 1713
+TCRETURNri 1714
+TCVTROWD 1715
+TCVTROWPS 1716
+TDCALL 1717
+TDPBF 1718
+TDPBHF 1719
+TDPBSSD 1720
+TDPBSUD 1721
+TDPBUSD 1722
+TDPBUUD 1723
+TDPFP 1724
+TDPHBF 1725
+TDPHF 1726
+TEST 1727
+TESTUI 1728
+TILELOADD 1729
+TILELOADDRS 1730
+TILELOADDRST 1731
+TILELOADDRS_EVEX 1732
+TILELOADDT 1733
+TILELOADD_EVEX 1734
+TILEMOVROWrre 1735
+TILEMOVROWrri 1736
+TILERELEASE 1737
+TILESTORED 1738
+TILESTORED_EVEX 1739
+TILEZERO 1740
+TLBSYNC 1741
+TLSCall 1742
+TLS_addr 1743
+TLS_addrX 1744
+TLS_base_addr 1745
+TLS_base_addrX 1746
+TLS_desc 1747
+TMMULTF 1748
+TPAUSE 1749
+TRAP 1750
+TST_F 1751
+TST_Fp 1752
+TZCNT 1753
+TZMSK 1754
+UBSAN_UD 1755
+UCOMISDrm 1756
+UCOMISDrm_Int 1757
+UCOMISDrr 1758
+UCOMISDrr_Int 1759
+UCOMISSrm 1760
+UCOMISSrm_Int 1761
+UCOMISSrr 1762
+UCOMISSrr_Int 1763
+UCOM_FIPr 1764
+UCOM_FIr 1765
+UCOM_FPPr 1766
+UCOM_FPr 1767
+UCOM_FpIr 1768
+UCOM_Fpr 1769
+UCOM_Fr 1770
+UD 1771
+UIRET 1772
+UMONITOR 1773
+UMWAIT 1774
+UNPCKHPDrm 1775
+UNPCKHPDrr 1776
+UNPCKHPSrm 1777
+UNPCKHPSrr 1778
+UNPCKLPDrm 1779
+UNPCKLPDrr 1780
+UNPCKLPSrm 1781
+UNPCKLPSrr 1782
+URDMSRri 1783
+URDMSRri_EVEX 1784
+URDMSRrr 1785
+URDMSRrr_EVEX 1786
+UWRMSRir 1787
+UWRMSRir_EVEX 1788
+UWRMSRrr 1789
+UWRMSRrr_EVEX 1790
+V 1791
+VAARG 1792
+VAARG_X 1793
+VADDBF 1794
+VADDPDYrm 1795
+VADDPDYrr 1796
+VADDPDZ 1797
+VADDPDZrm 1798
+VADDPDZrmb 1799
+VADDPDZrmbk 1800
+VADDPDZrmbkz 1801
+VADDPDZrmk 1802
+VADDPDZrmkz 1803
+VADDPDZrr 1804
+VADDPDZrrb 1805
+VADDPDZrrbk 1806
+VADDPDZrrbkz 1807
+VADDPDZrrk 1808
+VADDPDZrrkz 1809
+VADDPDrm 1810
+VADDPDrr 1811
+VADDPHZ 1812
+VADDPHZrm 1813
+VADDPHZrmb 1814
+VADDPHZrmbk 1815
+VADDPHZrmbkz 1816
+VADDPHZrmk 1817
+VADDPHZrmkz 1818
+VADDPHZrr 1819
+VADDPHZrrb 1820
+VADDPHZrrbk 1821
+VADDPHZrrbkz 1822
+VADDPHZrrk 1823
+VADDPHZrrkz 1824
+VADDPSYrm 1825
+VADDPSYrr 1826
+VADDPSZ 1827
+VADDPSZrm 1828
+VADDPSZrmb 1829
+VADDPSZrmbk 1830
+VADDPSZrmbkz 1831
+VADDPSZrmk 1832
+VADDPSZrmkz 1833
+VADDPSZrr 1834
+VADDPSZrrb 1835
+VADDPSZrrbk 1836
+VADDPSZrrbkz 1837
+VADDPSZrrk 1838
+VADDPSZrrkz 1839
+VADDPSrm 1840
+VADDPSrr 1841
+VADDSDZrm 1842
+VADDSDZrm_Int 1843
+VADDSDZrmk_Int 1844
+VADDSDZrmkz_Int 1845
+VADDSDZrr 1846
+VADDSDZrr_Int 1847
+VADDSDZrrb_Int 1848
+VADDSDZrrbk_Int 1849
+VADDSDZrrbkz_Int 1850
+VADDSDZrrk_Int 1851
+VADDSDZrrkz_Int 1852
+VADDSDrm 1853
+VADDSDrm_Int 1854
+VADDSDrr 1855
+VADDSDrr_Int 1856
+VADDSHZrm 1857
+VADDSHZrm_Int 1858
+VADDSHZrmk_Int 1859
+VADDSHZrmkz_Int 1860
+VADDSHZrr 1861
+VADDSHZrr_Int 1862
+VADDSHZrrb_Int 1863
+VADDSHZrrbk_Int 1864
+VADDSHZrrbkz_Int 1865
+VADDSHZrrk_Int 1866
+VADDSHZrrkz_Int 1867
+VADDSSZrm 1868
+VADDSSZrm_Int 1869
+VADDSSZrmk_Int 1870
+VADDSSZrmkz_Int 1871
+VADDSSZrr 1872
+VADDSSZrr_Int 1873
+VADDSSZrrb_Int 1874
+VADDSSZrrbk_Int 1875
+VADDSSZrrbkz_Int 1876
+VADDSSZrrk_Int 1877
+VADDSSZrrkz_Int 1878
+VADDSSrm 1879
+VADDSSrm_Int 1880
+VADDSSrr 1881
+VADDSSrr_Int 1882
+VADDSUBPDYrm 1883
+VADDSUBPDYrr 1884
+VADDSUBPDrm 1885
+VADDSUBPDrr 1886
+VADDSUBPSYrm 1887
+VADDSUBPSYrr 1888
+VADDSUBPSrm 1889
+VADDSUBPSrr 1890
+VAESDECLASTYrm 1891
+VAESDECLASTYrr 1892
+VAESDECLASTZ 1893
+VAESDECLASTZrm 1894
+VAESDECLASTZrr 1895
+VAESDECLASTrm 1896
+VAESDECLASTrr 1897
+VAESDECYrm 1898
+VAESDECYrr 1899
+VAESDECZ 1900
+VAESDECZrm 1901
+VAESDECZrr 1902
+VAESDECrm 1903
+VAESDECrr 1904
+VAESENCLASTYrm 1905
+VAESENCLASTYrr 1906
+VAESENCLASTZ 1907
+VAESENCLASTZrm 1908
+VAESENCLASTZrr 1909
+VAESENCLASTrm 1910
+VAESENCLASTrr 1911
+VAESENCYrm 1912
+VAESENCYrr 1913
+VAESENCZ 1914
+VAESENCZrm 1915
+VAESENCZrr 1916
+VAESENCrm 1917
+VAESENCrr 1918
+VAESIMCrm 1919
+VAESIMCrr 1920
+VAESKEYGENASSISTrmi 1921
+VAESKEYGENASSISTrri 1922
+VALIGNDZ 1923
+VALIGNDZrmbi 1924
+VALIGNDZrmbik 1925
+VALIGNDZrmbikz 1926
+VALIGNDZrmi 1927
+VALIGNDZrmik 1928
+VALIGNDZrmikz 1929
+VALIGNDZrri 1930
+VALIGNDZrrik 1931
+VALIGNDZrrikz 1932
+VALIGNQZ 1933
+VALIGNQZrmbi 1934
+VALIGNQZrmbik 1935
+VALIGNQZrmbikz 1936
+VALIGNQZrmi 1937
+VALIGNQZrmik 1938
+VALIGNQZrmikz 1939
+VALIGNQZrri 1940
+VALIGNQZrrik 1941
+VALIGNQZrrikz 1942
+VANDNPDYrm 1943
+VANDNPDYrr 1944
+VANDNPDZ 1945
+VANDNPDZrm 1946
+VANDNPDZrmb 1947
+VANDNPDZrmbk 1948
+VANDNPDZrmbkz 1949
+VANDNPDZrmk 1950
+VANDNPDZrmkz 1951
+VANDNPDZrr 1952
+VANDNPDZrrk 1953
+VANDNPDZrrkz 1954
+VANDNPDrm 1955
+VANDNPDrr 1956
+VANDNPSYrm 1957
+VANDNPSYrr 1958
+VANDNPSZ 1959
+VANDNPSZrm 1960
+VANDNPSZrmb 1961
+VANDNPSZrmbk 1962
+VANDNPSZrmbkz 1963
+VANDNPSZrmk 1964
+VANDNPSZrmkz 1965
+VANDNPSZrr 1966
+VANDNPSZrrk 1967
+VANDNPSZrrkz 1968
+VANDNPSrm 1969
+VANDNPSrr 1970
+VANDPDYrm 1971
+VANDPDYrr 1972
+VANDPDZ 1973
+VANDPDZrm 1974
+VANDPDZrmb 1975
+VANDPDZrmbk 1976
+VANDPDZrmbkz 1977
+VANDPDZrmk 1978
+VANDPDZrmkz 1979
+VANDPDZrr 1980
+VANDPDZrrk 1981
+VANDPDZrrkz 1982
+VANDPDrm 1983
+VANDPDrr 1984
+VANDPSYrm 1985
+VANDPSYrr 1986
+VANDPSZ 1987
+VANDPSZrm 1988
+VANDPSZrmb 1989
+VANDPSZrmbk 1990
+VANDPSZrmbkz 1991
+VANDPSZrmk 1992
+VANDPSZrmkz 1993
+VANDPSZrr 1994
+VANDPSZrrk 1995
+VANDPSZrrkz 1996
+VANDPSrm 1997
+VANDPSrr 1998
+VASTART_SAVE_XMM_REGS 1999
+VBCSTNEBF 2000
+VBCSTNESH 2001
+VBLENDMPDZ 2002
+VBLENDMPDZrm 2003
+VBLENDMPDZrmb 2004
+VBLENDMPDZrmbk 2005
+VBLENDMPDZrmbkz 2006
+VBLENDMPDZrmk 2007
+VBLENDMPDZrmkz 2008
+VBLENDMPDZrr 2009
+VBLENDMPDZrrk 2010
+VBLENDMPDZrrkz 2011
+VBLENDMPSZ 2012
+VBLENDMPSZrm 2013
+VBLENDMPSZrmb 2014
+VBLENDMPSZrmbk 2015
+VBLENDMPSZrmbkz 2016
+VBLENDMPSZrmk 2017
+VBLENDMPSZrmkz 2018
+VBLENDMPSZrr 2019
+VBLENDMPSZrrk 2020
+VBLENDMPSZrrkz 2021
+VBLENDPDYrmi 2022
+VBLENDPDYrri 2023
+VBLENDPDrmi 2024
+VBLENDPDrri 2025
+VBLENDPSYrmi 2026
+VBLENDPSYrri 2027
+VBLENDPSrmi 2028
+VBLENDPSrri 2029
+VBLENDVPDYrmr 2030
+VBLENDVPDYrrr 2031
+VBLENDVPDrmr 2032
+VBLENDVPDrrr 2033
+VBLENDVPSYrmr 2034
+VBLENDVPSYrrr 2035
+VBLENDVPSrmr 2036
+VBLENDVPSrrr 2037
+VBROADCASTF 2038
+VBROADCASTI 2039
+VBROADCASTSDYrm 2040
+VBROADCASTSDYrr 2041
+VBROADCASTSDZ 2042
+VBROADCASTSDZrm 2043
+VBROADCASTSDZrmk 2044
+VBROADCASTSDZrmkz 2045
+VBROADCASTSDZrr 2046
+VBROADCASTSDZrrk 2047
+VBROADCASTSDZrrkz 2048
+VBROADCASTSSYrm 2049
+VBROADCASTSSYrr 2050
+VBROADCASTSSZ 2051
+VBROADCASTSSZrm 2052
+VBROADCASTSSZrmk 2053
+VBROADCASTSSZrmkz 2054
+VBROADCASTSSZrr 2055
+VBROADCASTSSZrrk 2056
+VBROADCASTSSZrrkz 2057
+VBROADCASTSSrm 2058
+VBROADCASTSSrr 2059
+VCMPBF 2060
+VCMPPDYrmi 2061
+VCMPPDYrri 2062
+VCMPPDZ 2063
+VCMPPDZrmbi 2064
+VCMPPDZrmbik 2065
+VCMPPDZrmi 2066
+VCMPPDZrmik 2067
+VCMPPDZrri 2068
+VCMPPDZrrib 2069
+VCMPPDZrribk 2070
+VCMPPDZrrik 2071
+VCMPPDrmi 2072
+VCMPPDrri 2073
+VCMPPHZ 2074
+VCMPPHZrmbi 2075
+VCMPPHZrmbik 2076
+VCMPPHZrmi 2077
+VCMPPHZrmik 2078
+VCMPPHZrri 2079
+VCMPPHZrrib 2080
+VCMPPHZrribk 2081
+VCMPPHZrrik 2082
+VCMPPSYrmi 2083
+VCMPPSYrri 2084
+VCMPPSZ 2085
+VCMPPSZrmbi 2086
+VCMPPSZrmbik 2087
+VCMPPSZrmi 2088
+VCMPPSZrmik 2089
+VCMPPSZrri 2090
+VCMPPSZrrib 2091
+VCMPPSZrribk 2092
+VCMPPSZrrik 2093
+VCMPPSrmi 2094
+VCMPPSrri 2095
+VCMPSDZrmi 2096
+VCMPSDZrmi_Int 2097
+VCMPSDZrmik_Int 2098
+VCMPSDZrri 2099
+VCMPSDZrri_Int 2100
+VCMPSDZrrib_Int 2101
+VCMPSDZrribk_Int 2102
+VCMPSDZrrik_Int 2103
+VCMPSDrmi 2104
+VCMPSDrmi_Int 2105
+VCMPSDrri 2106
+VCMPSDrri_Int 2107
+VCMPSHZrmi 2108
+VCMPSHZrmi_Int 2109
+VCMPSHZrmik_Int 2110
+VCMPSHZrri 2111
+VCMPSHZrri_Int 2112
+VCMPSHZrrib_Int 2113
+VCMPSHZrribk_Int 2114
+VCMPSHZrrik_Int 2115
+VCMPSSZrmi 2116
+VCMPSSZrmi_Int 2117
+VCMPSSZrmik_Int 2118
+VCMPSSZrri 2119
+VCMPSSZrri_Int 2120
+VCMPSSZrrib_Int 2121
+VCMPSSZrribk_Int 2122
+VCMPSSZrrik_Int 2123
+VCMPSSrmi 2124
+VCMPSSrmi_Int 2125
+VCMPSSrri 2126
+VCMPSSrri_Int 2127
+VCOMISBF 2128
+VCOMISDZrm 2129
+VCOMISDZrm_Int 2130
+VCOMISDZrr 2131
+VCOMISDZrr_Int 2132
+VCOMISDZrrb 2133
+VCOMISDrm 2134
+VCOMISDrm_Int 2135
+VCOMISDrr 2136
+VCOMISDrr_Int 2137
+VCOMISHZrm 2138
+VCOMISHZrm_Int 2139
+VCOMISHZrr 2140
+VCOMISHZrr_Int 2141
+VCOMISHZrrb 2142
+VCOMISSZrm 2143
+VCOMISSZrm_Int 2144
+VCOMISSZrr 2145
+VCOMISSZrr_Int 2146
+VCOMISSZrrb 2147
+VCOMISSrm 2148
+VCOMISSrm_Int 2149
+VCOMISSrr 2150
+VCOMISSrr_Int 2151
+VCOMPRESSPDZ 2152
+VCOMPRESSPDZmr 2153
+VCOMPRESSPDZmrk 2154
+VCOMPRESSPDZrr 2155
+VCOMPRESSPDZrrk 2156
+VCOMPRESSPDZrrkz 2157
+VCOMPRESSPSZ 2158
+VCOMPRESSPSZmr 2159
+VCOMPRESSPSZmrk 2160
+VCOMPRESSPSZrr 2161
+VCOMPRESSPSZrrk 2162
+VCOMPRESSPSZrrkz 2163
+VCOMXSDZrm_Int 2164
+VCOMXSDZrr_Int 2165
+VCOMXSDZrrb_Int 2166
+VCOMXSHZrm_Int 2167
+VCOMXSHZrr_Int 2168
+VCOMXSHZrrb_Int 2169
+VCOMXSSZrm_Int 2170
+VCOMXSSZrr_Int 2171
+VCOMXSSZrrb_Int 2172
+VCVT 2173
+VCVTBF 2174
+VCVTBIASPH 2175
+VCVTDQ 2176
+VCVTHF 2177
+VCVTNE 2178
+VCVTNEEBF 2179
+VCVTNEEPH 2180
+VCVTNEOBF 2181
+VCVTNEOPH 2182
+VCVTNEPS 2183
+VCVTPD 2184
+VCVTPH 2185
+VCVTPS 2186
+VCVTQQ 2187
+VCVTSD 2188
+VCVTSH 2189
+VCVTSI 2190
+VCVTSS 2191
+VCVTTBF 2192
+VCVTTPD 2193
+VCVTTPH 2194
+VCVTTPS 2195
+VCVTTSD 2196
+VCVTTSH 2197
+VCVTTSS 2198
+VCVTUDQ 2199
+VCVTUQQ 2200
+VCVTUSI 2201
+VCVTUW 2202
+VCVTW 2203
+VDBPSADBWZ 2204
+VDBPSADBWZrmi 2205
+VDBPSADBWZrmik 2206
+VDBPSADBWZrmikz 2207
+VDBPSADBWZrri 2208
+VDBPSADBWZrrik 2209
+VDBPSADBWZrrikz 2210
+VDIVBF 2211
+VDIVPDYrm 2212
+VDIVPDYrr 2213
+VDIVPDZ 2214
+VDIVPDZrm 2215
+VDIVPDZrmb 2216
+VDIVPDZrmbk 2217
+VDIVPDZrmbkz 2218
+VDIVPDZrmk 2219
+VDIVPDZrmkz 2220
+VDIVPDZrr 2221
+VDIVPDZrrb 2222
+VDIVPDZrrbk 2223
+VDIVPDZrrbkz 2224
+VDIVPDZrrk 2225
+VDIVPDZrrkz 2226
+VDIVPDrm 2227
+VDIVPDrr 2228
+VDIVPHZ 2229
+VDIVPHZrm 2230
+VDIVPHZrmb 2231
+VDIVPHZrmbk 2232
+VDIVPHZrmbkz 2233
+VDIVPHZrmk 2234
+VDIVPHZrmkz 2235
+VDIVPHZrr 2236
+VDIVPHZrrb 2237
+VDIVPHZrrbk 2238
+VDIVPHZrrbkz 2239
+VDIVPHZrrk 2240
+VDIVPHZrrkz 2241
+VDIVPSYrm 2242
+VDIVPSYrr 2243
+VDIVPSZ 2244
+VDIVPSZrm 2245
+VDIVPSZrmb 2246
+VDIVPSZrmbk 2247
+VDIVPSZrmbkz 2248
+VDIVPSZrmk 2249
+VDIVPSZrmkz 2250
+VDIVPSZrr 2251
+VDIVPSZrrb 2252
+VDIVPSZrrbk 2253
+VDIVPSZrrbkz 2254
+VDIVPSZrrk 2255
+VDIVPSZrrkz 2256
+VDIVPSrm 2257
+VDIVPSrr 2258
+VDIVSDZrm 2259
+VDIVSDZrm_Int 2260
+VDIVSDZrmk_Int 2261
+VDIVSDZrmkz_Int 2262
+VDIVSDZrr 2263
+VDIVSDZrr_Int 2264
+VDIVSDZrrb_Int 2265
+VDIVSDZrrbk_Int 2266
+VDIVSDZrrbkz_Int 2267
+VDIVSDZrrk_Int 2268
+VDIVSDZrrkz_Int 2269
+VDIVSDrm 2270
+VDIVSDrm_Int 2271
+VDIVSDrr 2272
+VDIVSDrr_Int 2273
+VDIVSHZrm 2274
+VDIVSHZrm_Int 2275
+VDIVSHZrmk_Int 2276
+VDIVSHZrmkz_Int 2277
+VDIVSHZrr 2278
+VDIVSHZrr_Int 2279
+VDIVSHZrrb_Int 2280
+VDIVSHZrrbk_Int 2281
+VDIVSHZrrbkz_Int 2282
+VDIVSHZrrk_Int 2283
+VDIVSHZrrkz_Int 2284
+VDIVSSZrm 2285
+VDIVSSZrm_Int 2286
+VDIVSSZrmk_Int 2287
+VDIVSSZrmkz_Int 2288
+VDIVSSZrr 2289
+VDIVSSZrr_Int 2290
+VDIVSSZrrb_Int 2291
+VDIVSSZrrbk_Int 2292
+VDIVSSZrrbkz_Int 2293
+VDIVSSZrrk_Int 2294
+VDIVSSZrrkz_Int 2295
+VDIVSSrm 2296
+VDIVSSrm_Int 2297
+VDIVSSrr 2298
+VDIVSSrr_Int 2299
+VDPBF 2300
+VDPPDrmi 2301
+VDPPDrri 2302
+VDPPHPSZ 2303
+VDPPHPSZm 2304
+VDPPHPSZmb 2305
+VDPPHPSZmbk 2306
+VDPPHPSZmbkz 2307
+VDPPHPSZmk 2308
+VDPPHPSZmkz 2309
+VDPPHPSZr 2310
+VDPPHPSZrk 2311
+VDPPHPSZrkz 2312
+VDPPSYrmi 2313
+VDPPSYrri 2314
+VDPPSrmi 2315
+VDPPSrri 2316
+VERRm 2317
+VERRr 2318
+VERWm 2319
+VERWr 2320
+VEXP 2321
+VEXPANDPDZ 2322
+VEXPANDPDZrm 2323
+VEXPANDPDZrmk 2324
+VEXPANDPDZrmkz 2325
+VEXPANDPDZrr 2326
+VEXPANDPDZrrk 2327
+VEXPANDPDZrrkz 2328
+VEXPANDPSZ 2329
+VEXPANDPSZrm 2330
+VEXPANDPSZrmk 2331
+VEXPANDPSZrmkz 2332
+VEXPANDPSZrr 2333
+VEXPANDPSZrrk 2334
+VEXPANDPSZrrkz 2335
+VEXTRACTF 2336
+VEXTRACTI 2337
+VEXTRACTPSZmri 2338
+VEXTRACTPSZrri 2339
+VEXTRACTPSmri 2340
+VEXTRACTPSrri 2341
+VFCMADDCPHZ 2342
+VFCMADDCPHZm 2343
+VFCMADDCPHZmb 2344
+VFCMADDCPHZmbk 2345
+VFCMADDCPHZmbkz 2346
+VFCMADDCPHZmk 2347
+VFCMADDCPHZmkz 2348
+VFCMADDCPHZr 2349
+VFCMADDCPHZrb 2350
+VFCMADDCPHZrbk 2351
+VFCMADDCPHZrbkz 2352
+VFCMADDCPHZrk 2353
+VFCMADDCPHZrkz 2354
+VFCMADDCSHZm 2355
+VFCMADDCSHZmk 2356
+VFCMADDCSHZmkz 2357
+VFCMADDCSHZr 2358
+VFCMADDCSHZrb 2359
+VFCMADDCSHZrbk 2360
+VFCMADDCSHZrbkz 2361
+VFCMADDCSHZrk 2362
+VFCMADDCSHZrkz 2363
+VFCMULCPHZ 2364
+VFCMULCPHZrm 2365
+VFCMULCPHZrmb 2366
+VFCMULCPHZrmbk 2367
+VFCMULCPHZrmbkz 2368
+VFCMULCPHZrmk 2369
+VFCMULCPHZrmkz 2370
+VFCMULCPHZrr 2371
+VFCMULCPHZrrb 2372
+VFCMULCPHZrrbk 2373
+VFCMULCPHZrrbkz 2374
+VFCMULCPHZrrk 2375
+VFCMULCPHZrrkz 2376
+VFCMULCSHZrm 2377
+VFCMULCSHZrmk 2378
+VFCMULCSHZrmkz 2379
+VFCMULCSHZrr 2380
+VFCMULCSHZrrb 2381
+VFCMULCSHZrrbk 2382
+VFCMULCSHZrrbkz 2383
+VFCMULCSHZrrk 2384
+VFCMULCSHZrrkz 2385
+VFIXUPIMMPDZ 2386
+VFIXUPIMMPDZrmbi 2387
+VFIXUPIMMPDZrmbik 2388
+VFIXUPIMMPDZrmbikz 2389
+VFIXUPIMMPDZrmi 2390
+VFIXUPIMMPDZrmik 2391
+VFIXUPIMMPDZrmikz 2392
+VFIXUPIMMPDZrri 2393
+VFIXUPIMMPDZrrib 2394
+VFIXUPIMMPDZrribk 2395
+VFIXUPIMMPDZrribkz 2396
+VFIXUPIMMPDZrrik 2397
+VFIXUPIMMPDZrrikz 2398
+VFIXUPIMMPSZ 2399
+VFIXUPIMMPSZrmbi 2400
+VFIXUPIMMPSZrmbik 2401
+VFIXUPIMMPSZrmbikz 2402
+VFIXUPIMMPSZrmi 2403
+VFIXUPIMMPSZrmik 2404
+VFIXUPIMMPSZrmikz 2405
+VFIXUPIMMPSZrri 2406
+VFIXUPIMMPSZrrib 2407
+VFIXUPIMMPSZrribk 2408
+VFIXUPIMMPSZrribkz 2409
+VFIXUPIMMPSZrrik 2410
+VFIXUPIMMPSZrrikz 2411
+VFIXUPIMMSDZrmi 2412
+VFIXUPIMMSDZrmik 2413
+VFIXUPIMMSDZrmikz 2414
+VFIXUPIMMSDZrri 2415
+VFIXUPIMMSDZrrib 2416
+VFIXUPIMMSDZrribk 2417
+VFIXUPIMMSDZrribkz 2418
+VFIXUPIMMSDZrrik 2419
+VFIXUPIMMSDZrrikz 2420
+VFIXUPIMMSSZrmi 2421
+VFIXUPIMMSSZrmik 2422
+VFIXUPIMMSSZrmikz 2423
+VFIXUPIMMSSZrri 2424
+VFIXUPIMMSSZrrib 2425
+VFIXUPIMMSSZrribk 2426
+VFIXUPIMMSSZrribkz 2427
+VFIXUPIMMSSZrrik 2428
+VFIXUPIMMSSZrrikz 2429
+VFMADD 2430
+VFMADDCPHZ 2431
+VFMADDCPHZm 2432
+VFMADDCPHZmb 2433
+VFMADDCPHZmbk 2434
+VFMADDCPHZmbkz 2435
+VFMADDCPHZmk 2436
+VFMADDCPHZmkz 2437
+VFMADDCPHZr 2438
+VFMADDCPHZrb 2439
+VFMADDCPHZrbk 2440
+VFMADDCPHZrbkz 2441
+VFMADDCPHZrk 2442
+VFMADDCPHZrkz 2443
+VFMADDCSHZm 2444
+VFMADDCSHZmk 2445
+VFMADDCSHZmkz 2446
+VFMADDCSHZr 2447
+VFMADDCSHZrb 2448
+VFMADDCSHZrbk 2449
+VFMADDCSHZrbkz 2450
+VFMADDCSHZrk 2451
+VFMADDCSHZrkz 2452
+VFMADDPD 2453
+VFMADDPS 2454
+VFMADDSD 2455
+VFMADDSS 2456
+VFMADDSUB 2457
+VFMADDSUBPD 2458
+VFMADDSUBPS 2459
+VFMSUB 2460
+VFMSUBADD 2461
+VFMSUBADDPD 2462
+VFMSUBADDPS 2463
+VFMSUBPD 2464
+VFMSUBPS 2465
+VFMSUBSD 2466
+VFMSUBSS 2467
+VFMULCPHZ 2468
+VFMULCPHZrm 2469
+VFMULCPHZrmb 2470
+VFMULCPHZrmbk 2471
+VFMULCPHZrmbkz 2472
+VFMULCPHZrmk 2473
+VFMULCPHZrmkz 2474
+VFMULCPHZrr 2475
+VFMULCPHZrrb 2476
+VFMULCPHZrrbk 2477
+VFMULCPHZrrbkz 2478
+VFMULCPHZrrk 2479
+VFMULCPHZrrkz 2480
+VFMULCSHZrm 2481
+VFMULCSHZrmk 2482
+VFMULCSHZrmkz 2483
+VFMULCSHZrr 2484
+VFMULCSHZrrb 2485
+VFMULCSHZrrbk 2486
+VFMULCSHZrrbkz 2487
+VFMULCSHZrrk 2488
+VFMULCSHZrrkz 2489
+VFNMADD 2490
+VFNMADDPD 2491
+VFNMADDPS 2492
+VFNMADDSD 2493
+VFNMADDSS 2494
+VFNMSUB 2495
+VFNMSUBPD 2496
+VFNMSUBPS 2497
+VFNMSUBSD 2498
+VFNMSUBSS 2499
+VFPCLASSBF 2500
+VFPCLASSPDZ 2501
+VFPCLASSPDZmbi 2502
+VFPCLASSPDZmbik 2503
+VFPCLASSPDZmi 2504
+VFPCLASSPDZmik 2505
+VFPCLASSPDZri 2506
+VFPCLASSPDZrik 2507
+VFPCLASSPHZ 2508
+VFPCLASSPHZmbi 2509
+VFPCLASSPHZmbik 2510
+VFPCLASSPHZmi 2511
+VFPCLASSPHZmik 2512
+VFPCLASSPHZri 2513
+VFPCLASSPHZrik 2514
+VFPCLASSPSZ 2515
+VFPCLASSPSZmbi 2516
+VFPCLASSPSZmbik 2517
+VFPCLASSPSZmi 2518
+VFPCLASSPSZmik 2519
+VFPCLASSPSZri 2520
+VFPCLASSPSZrik 2521
+VFPCLASSSDZmi 2522
+VFPCLASSSDZmik 2523
+VFPCLASSSDZri 2524
+VFPCLASSSDZrik 2525
+VFPCLASSSHZmi 2526
+VFPCLASSSHZmik 2527
+VFPCLASSSHZri 2528
+VFPCLASSSHZrik 2529
+VFPCLASSSSZmi 2530
+VFPCLASSSSZmik 2531
+VFPCLASSSSZri 2532
+VFPCLASSSSZrik 2533
+VFRCZPDYrm 2534
+VFRCZPDYrr 2535
+VFRCZPDrm 2536
+VFRCZPDrr 2537
+VFRCZPSYrm 2538
+VFRCZPSYrr 2539
+VFRCZPSrm 2540
+VFRCZPSrr 2541
+VFRCZSDrm 2542
+VFRCZSDrr 2543
+VFRCZSSrm 2544
+VFRCZSSrr 2545
+VGATHERDPDYrm 2546
+VGATHERDPDZ 2547
+VGATHERDPDZrm 2548
+VGATHERDPDrm 2549
+VGATHERDPSYrm 2550
+VGATHERDPSZ 2551
+VGATHERDPSZrm 2552
+VGATHERDPSrm 2553
+VGATHERPF 2554
+VGATHERQPDYrm 2555
+VGATHERQPDZ 2556
+VGATHERQPDZrm 2557
+VGATHERQPDrm 2558
+VGATHERQPSYrm 2559
+VGATHERQPSZ 2560
+VGATHERQPSZrm 2561
+VGATHERQPSrm 2562
+VGETEXPBF 2563
+VGETEXPPDZ 2564
+VGETEXPPDZm 2565
+VGETEXPPDZmb 2566
+VGETEXPPDZmbk 2567
+VGETEXPPDZmbkz 2568
+VGETEXPPDZmk 2569
+VGETEXPPDZmkz 2570
+VGETEXPPDZr 2571
+VGETEXPPDZrb 2572
+VGETEXPPDZrbk 2573
+VGETEXPPDZrbkz 2574
+VGETEXPPDZrk 2575
+VGETEXPPDZrkz 2576
+VGETEXPPHZ 2577
+VGETEXPPHZm 2578
+VGETEXPPHZmb 2579
+VGETEXPPHZmbk 2580
+VGETEXPPHZmbkz 2581
+VGETEXPPHZmk 2582
+VGETEXPPHZmkz 2583
+VGETEXPPHZr 2584
+VGETEXPPHZrb 2585
+VGETEXPPHZrbk 2586
+VGETEXPPHZrbkz 2587
+VGETEXPPHZrk 2588
+VGETEXPPHZrkz 2589
+VGETEXPPSZ 2590
+VGETEXPPSZm 2591
+VGETEXPPSZmb 2592
+VGETEXPPSZmbk 2593
+VGETEXPPSZmbkz 2594
+VGETEXPPSZmk 2595
+VGETEXPPSZmkz 2596
+VGETEXPPSZr 2597
+VGETEXPPSZrb 2598
+VGETEXPPSZrbk 2599
+VGETEXPPSZrbkz 2600
+VGETEXPPSZrk 2601
+VGETEXPPSZrkz 2602
+VGETEXPSDZm 2603
+VGETEXPSDZmk 2604
+VGETEXPSDZmkz 2605
+VGETEXPSDZr 2606
+VGETEXPSDZrb 2607
+VGETEXPSDZrbk 2608
+VGETEXPSDZrbkz 2609
+VGETEXPSDZrk 2610
+VGETEXPSDZrkz 2611
+VGETEXPSHZm 2612
+VGETEXPSHZmk 2613
+VGETEXPSHZmkz 2614
+VGETEXPSHZr 2615
+VGETEXPSHZrb 2616
+VGETEXPSHZrbk 2617
+VGETEXPSHZrbkz 2618
+VGETEXPSHZrk 2619
+VGETEXPSHZrkz 2620
+VGETEXPSSZm 2621
+VGETEXPSSZmk 2622
+VGETEXPSSZmkz 2623
+VGETEXPSSZr 2624
+VGETEXPSSZrb 2625
+VGETEXPSSZrbk 2626
+VGETEXPSSZrbkz 2627
+VGETEXPSSZrk 2628
+VGETEXPSSZrkz 2629
+VGETMANTBF 2630
+VGETMANTPDZ 2631
+VGETMANTPDZrmbi 2632
+VGETMANTPDZrmbik 2633
+VGETMANTPDZrmbikz 2634
+VGETMANTPDZrmi 2635
+VGETMANTPDZrmik 2636
+VGETMANTPDZrmikz 2637
+VGETMANTPDZrri 2638
+VGETMANTPDZrrib 2639
+VGETMANTPDZrribk 2640
+VGETMANTPDZrribkz 2641
+VGETMANTPDZrrik 2642
+VGETMANTPDZrrikz 2643
+VGETMANTPHZ 2644
+VGETMANTPHZrmbi 2645
+VGETMANTPHZrmbik 2646
+VGETMANTPHZrmbikz 2647
+VGETMANTPHZrmi 2648
+VGETMANTPHZrmik 2649
+VGETMANTPHZrmikz 2650
+VGETMANTPHZrri 2651
+VGETMANTPHZrrib 2652
+VGETMANTPHZrribk 2653
+VGETMANTPHZrribkz 2654
+VGETMANTPHZrrik 2655
+VGETMANTPHZrrikz 2656
+VGETMANTPSZ 2657
+VGETMANTPSZrmbi 2658
+VGETMANTPSZrmbik 2659
+VGETMANTPSZrmbikz 2660
+VGETMANTPSZrmi 2661
+VGETMANTPSZrmik 2662
+VGETMANTPSZrmikz 2663
+VGETMANTPSZrri 2664
+VGETMANTPSZrrib 2665
+VGETMANTPSZrribk 2666
+VGETMANTPSZrribkz 2667
+VGETMANTPSZrrik 2668
+VGETMANTPSZrrikz 2669
+VGETMANTSDZrmi 2670
+VGETMANTSDZrmik 2671
+VGETMANTSDZrmikz 2672
+VGETMANTSDZrri 2673
+VGETMANTSDZrrib 2674
+VGETMANTSDZrribk 2675
+VGETMANTSDZrribkz 2676
+VGETMANTSDZrrik 2677
+VGETMANTSDZrrikz 2678
+VGETMANTSHZrmi 2679
+VGETMANTSHZrmik 2680
+VGETMANTSHZrmikz 2681
+VGETMANTSHZrri 2682
+VGETMANTSHZrrib 2683
+VGETMANTSHZrribk 2684
+VGETMANTSHZrribkz 2685
+VGETMANTSHZrrik 2686
+VGETMANTSHZrrikz 2687
+VGETMANTSSZrmi 2688
+VGETMANTSSZrmik 2689
+VGETMANTSSZrmikz 2690
+VGETMANTSSZrri 2691
+VGETMANTSSZrrib 2692
+VGETMANTSSZrribk 2693
+VGETMANTSSZrribkz 2694
+VGETMANTSSZrrik 2695
+VGETMANTSSZrrikz 2696
+VGF 2697
+VHADDPDYrm 2698
+VHADDPDYrr 2699
+VHADDPDrm 2700
+VHADDPDrr 2701
+VHADDPSYrm 2702
+VHADDPSYrr 2703
+VHADDPSrm 2704
+VHADDPSrr 2705
+VHSUBPDYrm 2706
+VHSUBPDYrr 2707
+VHSUBPDrm 2708
+VHSUBPDrr 2709
+VHSUBPSYrm 2710
+VHSUBPSYrr 2711
+VHSUBPSrm 2712
+VHSUBPSrr 2713
+VINSERTF 2714
+VINSERTI 2715
+VINSERTPSZrmi 2716
+VINSERTPSZrri 2717
+VINSERTPSrmi 2718
+VINSERTPSrri 2719
+VLDDQUYrm 2720
+VLDDQUrm 2721
+VLDMXCSR 2722
+VMASKMOVDQU 2723
+VMASKMOVPDYmr 2724
+VMASKMOVPDYrm 2725
+VMASKMOVPDmr 2726
+VMASKMOVPDrm 2727
+VMASKMOVPSYmr 2728
+VMASKMOVPSYrm 2729
+VMASKMOVPSmr 2730
+VMASKMOVPSrm 2731
+VMAXBF 2732
+VMAXCPDYrm 2733
+VMAXCPDYrr 2734
+VMAXCPDZ 2735
+VMAXCPDZrm 2736
+VMAXCPDZrmb 2737
+VMAXCPDZrmbk 2738
+VMAXCPDZrmbkz 2739
+VMAXCPDZrmk 2740
+VMAXCPDZrmkz 2741
+VMAXCPDZrr 2742
+VMAXCPDZrrk 2743
+VMAXCPDZrrkz 2744
+VMAXCPDrm 2745
+VMAXCPDrr 2746
+VMAXCPHZ 2747
+VMAXCPHZrm 2748
+VMAXCPHZrmb 2749
+VMAXCPHZrmbk 2750
+VMAXCPHZrmbkz 2751
+VMAXCPHZrmk 2752
+VMAXCPHZrmkz 2753
+VMAXCPHZrr 2754
+VMAXCPHZrrk 2755
+VMAXCPHZrrkz 2756
+VMAXCPSYrm 2757
+VMAXCPSYrr 2758
+VMAXCPSZ 2759
+VMAXCPSZrm 2760
+VMAXCPSZrmb 2761
+VMAXCPSZrmbk 2762
+VMAXCPSZrmbkz 2763
+VMAXCPSZrmk 2764
+VMAXCPSZrmkz 2765
+VMAXCPSZrr 2766
+VMAXCPSZrrk 2767
+VMAXCPSZrrkz 2768
+VMAXCPSrm 2769
+VMAXCPSrr 2770
+VMAXCSDZrm 2771
+VMAXCSDZrr 2772
+VMAXCSDrm 2773
+VMAXCSDrr 2774
+VMAXCSHZrm 2775
+VMAXCSHZrr 2776
+VMAXCSSZrm 2777
+VMAXCSSZrr 2778
+VMAXCSSrm 2779
+VMAXCSSrr 2780
+VMAXPDYrm 2781
+VMAXPDYrr 2782
+VMAXPDZ 2783
+VMAXPDZrm 2784
+VMAXPDZrmb 2785
+VMAXPDZrmbk 2786
+VMAXPDZrmbkz 2787
+VMAXPDZrmk 2788
+VMAXPDZrmkz 2789
+VMAXPDZrr 2790
+VMAXPDZrrb 2791
+VMAXPDZrrbk 2792
+VMAXPDZrrbkz 2793
+VMAXPDZrrk 2794
+VMAXPDZrrkz 2795
+VMAXPDrm 2796
+VMAXPDrr 2797
+VMAXPHZ 2798
+VMAXPHZrm 2799
+VMAXPHZrmb 2800
+VMAXPHZrmbk 2801
+VMAXPHZrmbkz 2802
+VMAXPHZrmk 2803
+VMAXPHZrmkz 2804
+VMAXPHZrr 2805
+VMAXPHZrrb 2806
+VMAXPHZrrbk 2807
+VMAXPHZrrbkz 2808
+VMAXPHZrrk 2809
+VMAXPHZrrkz 2810
+VMAXPSYrm 2811
+VMAXPSYrr 2812
+VMAXPSZ 2813
+VMAXPSZrm 2814
+VMAXPSZrmb 2815
+VMAXPSZrmbk 2816
+VMAXPSZrmbkz 2817
+VMAXPSZrmk 2818
+VMAXPSZrmkz 2819
+VMAXPSZrr 2820
+VMAXPSZrrb 2821
+VMAXPSZrrbk 2822
+VMAXPSZrrbkz 2823
+VMAXPSZrrk 2824
+VMAXPSZrrkz 2825
+VMAXPSrm 2826
+VMAXPSrr 2827
+VMAXSDZrm 2828
+VMAXSDZrm_Int 2829
+VMAXSDZrmk_Int 2830
+VMAXSDZrmkz_Int 2831
+VMAXSDZrr 2832
+VMAXSDZrr_Int 2833
+VMAXSDZrrb_Int 2834
+VMAXSDZrrbk_Int 2835
+VMAXSDZrrbkz_Int 2836
+VMAXSDZrrk_Int 2837
+VMAXSDZrrkz_Int 2838
+VMAXSDrm 2839
+VMAXSDrm_Int 2840
+VMAXSDrr 2841
+VMAXSDrr_Int 2842
+VMAXSHZrm 2843
+VMAXSHZrm_Int 2844
+VMAXSHZrmk_Int 2845
+VMAXSHZrmkz_Int 2846
+VMAXSHZrr 2847
+VMAXSHZrr_Int 2848
+VMAXSHZrrb_Int 2849
+VMAXSHZrrbk_Int 2850
+VMAXSHZrrbkz_Int 2851
+VMAXSHZrrk_Int 2852
+VMAXSHZrrkz_Int 2853
+VMAXSSZrm 2854
+VMAXSSZrm_Int 2855
+VMAXSSZrmk_Int 2856
+VMAXSSZrmkz_Int 2857
+VMAXSSZrr 2858
+VMAXSSZrr_Int 2859
+VMAXSSZrrb_Int 2860
+VMAXSSZrrbk_Int 2861
+VMAXSSZrrbkz_Int 2862
+VMAXSSZrrk_Int 2863
+VMAXSSZrrkz_Int 2864
+VMAXSSrm 2865
+VMAXSSrm_Int 2866
+VMAXSSrr 2867
+VMAXSSrr_Int 2868
+VMCALL 2869
+VMCLEARm 2870
+VMFUNC 2871
+VMINBF 2872
+VMINCPDYrm 2873
+VMINCPDYrr 2874
+VMINCPDZ 2875
+VMINCPDZrm 2876
+VMINCPDZrmb 2877
+VMINCPDZrmbk 2878
+VMINCPDZrmbkz 2879
+VMINCPDZrmk 2880
+VMINCPDZrmkz 2881
+VMINCPDZrr 2882
+VMINCPDZrrk 2883
+VMINCPDZrrkz 2884
+VMINCPDrm 2885
+VMINCPDrr 2886
+VMINCPHZ 2887
+VMINCPHZrm 2888
+VMINCPHZrmb 2889
+VMINCPHZrmbk 2890
+VMINCPHZrmbkz 2891
+VMINCPHZrmk 2892
+VMINCPHZrmkz 2893
+VMINCPHZrr 2894
+VMINCPHZrrk 2895
+VMINCPHZrrkz 2896
+VMINCPSYrm 2897
+VMINCPSYrr 2898
+VMINCPSZ 2899
+VMINCPSZrm 2900
+VMINCPSZrmb 2901
+VMINCPSZrmbk 2902
+VMINCPSZrmbkz 2903
+VMINCPSZrmk 2904
+VMINCPSZrmkz 2905
+VMINCPSZrr 2906
+VMINCPSZrrk 2907
+VMINCPSZrrkz 2908
+VMINCPSrm 2909
+VMINCPSrr 2910
+VMINCSDZrm 2911
+VMINCSDZrr 2912
+VMINCSDrm 2913
+VMINCSDrr 2914
+VMINCSHZrm 2915
+VMINCSHZrr 2916
+VMINCSSZrm 2917
+VMINCSSZrr 2918
+VMINCSSrm 2919
+VMINCSSrr 2920
+VMINMAXBF 2921
+VMINMAXPDZ 2922
+VMINMAXPDZrmbi 2923
+VMINMAXPDZrmbik 2924
+VMINMAXPDZrmbikz 2925
+VMINMAXPDZrmi 2926
+VMINMAXPDZrmik 2927
+VMINMAXPDZrmikz 2928
+VMINMAXPDZrri 2929
+VMINMAXPDZrrib 2930
+VMINMAXPDZrribk 2931
+VMINMAXPDZrribkz 2932
+VMINMAXPDZrrik 2933
+VMINMAXPDZrrikz 2934
+VMINMAXPHZ 2935
+VMINMAXPHZrmbi 2936
+VMINMAXPHZrmbik 2937
+VMINMAXPHZrmbikz 2938
+VMINMAXPHZrmi 2939
+VMINMAXPHZrmik 2940
+VMINMAXPHZrmikz 2941
+VMINMAXPHZrri 2942
+VMINMAXPHZrrib 2943
+VMINMAXPHZrribk 2944
+VMINMAXPHZrribkz 2945
+VMINMAXPHZrrik 2946
+VMINMAXPHZrrikz 2947
+VMINMAXPSZ 2948
+VMINMAXPSZrmbi 2949
+VMINMAXPSZrmbik 2950
+VMINMAXPSZrmbikz 2951
+VMINMAXPSZrmi 2952
+VMINMAXPSZrmik 2953
+VMINMAXPSZrmikz 2954
+VMINMAXPSZrri 2955
+VMINMAXPSZrrib 2956
+VMINMAXPSZrribk 2957
+VMINMAXPSZrribkz 2958
+VMINMAXPSZrrik 2959
+VMINMAXPSZrrikz 2960
+VMINMAXSDrmi 2961
+VMINMAXSDrmi_Int 2962
+VMINMAXSDrmik_Int 2963
+VMINMAXSDrmikz_Int 2964
+VMINMAXSDrri 2965
+VMINMAXSDrri_Int 2966
+VMINMAXSDrrib_Int 2967
+VMINMAXSDrribk_Int 2968
+VMINMAXSDrribkz_Int 2969
+VMINMAXSDrrik_Int 2970
+VMINMAXSDrrikz_Int 2971
+VMINMAXSHrmi 2972
+VMINMAXSHrmi_Int 2973
+VMINMAXSHrmik_Int 2974
+VMINMAXSHrmikz_Int 2975
+VMINMAXSHrri 2976
+VMINMAXSHrri_Int 2977
+VMINMAXSHrrib_Int 2978
+VMINMAXSHrribk_Int 2979
+VMINMAXSHrribkz_Int 2980
+VMINMAXSHrrik_Int 2981
+VMINMAXSHrrikz_Int 2982
+VMINMAXSSrmi 2983
+VMINMAXSSrmi_Int 2984
+VMINMAXSSrmik_Int 2985
+VMINMAXSSrmikz_Int 2986
+VMINMAXSSrri 2987
+VMINMAXSSrri_Int 2988
+VMINMAXSSrrib_Int 2989
+VMINMAXSSrribk_Int 2990
+VMINMAXSSrribkz_Int 2991
+VMINMAXSSrrik_Int 2992
+VMINMAXSSrrikz_Int 2993
+VMINPDYrm 2994
+VMINPDYrr 2995
+VMINPDZ 2996
+VMINPDZrm 2997
+VMINPDZrmb 2998
+VMINPDZrmbk 2999
+VMINPDZrmbkz 3000
+VMINPDZrmk 3001
+VMINPDZrmkz 3002
+VMINPDZrr 3003
+VMINPDZrrb 3004
+VMINPDZrrbk 3005
+VMINPDZrrbkz 3006
+VMINPDZrrk 3007
+VMINPDZrrkz 3008
+VMINPDrm 3009
+VMINPDrr 3010
+VMINPHZ 3011
+VMINPHZrm 3012
+VMINPHZrmb 3013
+VMINPHZrmbk 3014
+VMINPHZrmbkz 3015
+VMINPHZrmk 3016
+VMINPHZrmkz 3017
+VMINPHZrr 3018
+VMINPHZrrb 3019
+VMINPHZrrbk 3020
+VMINPHZrrbkz 3021
+VMINPHZrrk 3022
+VMINPHZrrkz 3023
+VMINPSYrm 3024
+VMINPSYrr 3025
+VMINPSZ 3026
+VMINPSZrm 3027
+VMINPSZrmb 3028
+VMINPSZrmbk 3029
+VMINPSZrmbkz 3030
+VMINPSZrmk 3031
+VMINPSZrmkz 3032
+VMINPSZrr 3033
+VMINPSZrrb 3034
+VMINPSZrrbk 3035
+VMINPSZrrbkz 3036
+VMINPSZrrk 3037
+VMINPSZrrkz 3038
+VMINPSrm 3039
+VMINPSrr 3040
+VMINSDZrm 3041
+VMINSDZrm_Int 3042
+VMINSDZrmk_Int 3043
+VMINSDZrmkz_Int 3044
+VMINSDZrr 3045
+VMINSDZrr_Int 3046
+VMINSDZrrb_Int 3047
+VMINSDZrrbk_Int 3048
+VMINSDZrrbkz_Int 3049
+VMINSDZrrk_Int 3050
+VMINSDZrrkz_Int 3051
+VMINSDrm 3052
+VMINSDrm_Int 3053
+VMINSDrr 3054
+VMINSDrr_Int 3055
+VMINSHZrm 3056
+VMINSHZrm_Int 3057
+VMINSHZrmk_Int 3058
+VMINSHZrmkz_Int 3059
+VMINSHZrr 3060
+VMINSHZrr_Int 3061
+VMINSHZrrb_Int 3062
+VMINSHZrrbk_Int 3063
+VMINSHZrrbkz_Int 3064
+VMINSHZrrk_Int 3065
+VMINSHZrrkz_Int 3066
+VMINSSZrm 3067
+VMINSSZrm_Int 3068
+VMINSSZrmk_Int 3069
+VMINSSZrmkz_Int 3070
+VMINSSZrr 3071
+VMINSSZrr_Int 3072
+VMINSSZrrb_Int 3073
+VMINSSZrrbk_Int 3074
+VMINSSZrrbkz_Int 3075
+VMINSSZrrk_Int 3076
+VMINSSZrrkz_Int 3077
+VMINSSrm 3078
+VMINSSrm_Int 3079
+VMINSSrr 3080
+VMINSSrr_Int 3081
+VMLAUNCH 3082
+VMLOAD 3083
+VMMCALL 3084
+VMOV 3085
+VMOVAPDYmr 3086
+VMOVAPDYrm 3087
+VMOVAPDYrr 3088
+VMOVAPDYrr_REV 3089
+VMOVAPDZ 3090
+VMOVAPDZmr 3091
+VMOVAPDZmrk 3092
+VMOVAPDZrm 3093
+VMOVAPDZrmk 3094
+VMOVAPDZrmkz 3095
+VMOVAPDZrr 3096
+VMOVAPDZrr_REV 3097
+VMOVAPDZrrk 3098
+VMOVAPDZrrk_REV 3099
+VMOVAPDZrrkz 3100
+VMOVAPDZrrkz_REV 3101
+VMOVAPDmr 3102
+VMOVAPDrm 3103
+VMOVAPDrr 3104
+VMOVAPDrr_REV 3105
+VMOVAPSYmr 3106
+VMOVAPSYrm 3107
+VMOVAPSYrr 3108
+VMOVAPSYrr_REV 3109
+VMOVAPSZ 3110
+VMOVAPSZmr 3111
+VMOVAPSZmrk 3112
+VMOVAPSZrm 3113
+VMOVAPSZrmk 3114
+VMOVAPSZrmkz 3115
+VMOVAPSZrr 3116
+VMOVAPSZrr_REV 3117
+VMOVAPSZrrk 3118
+VMOVAPSZrrk_REV 3119
+VMOVAPSZrrkz 3120
+VMOVAPSZrrkz_REV 3121
+VMOVAPSmr 3122
+VMOVAPSrm 3123
+VMOVAPSrr 3124
+VMOVAPSrr_REV 3125
+VMOVDDUPYrm 3126
+VMOVDDUPYrr 3127
+VMOVDDUPZ 3128
+VMOVDDUPZrm 3129
+VMOVDDUPZrmk 3130
+VMOVDDUPZrmkz 3131
+VMOVDDUPZrr 3132
+VMOVDDUPZrrk 3133
+VMOVDDUPZrrkz 3134
+VMOVDDUPrm 3135
+VMOVDDUPrr 3136
+VMOVDI 3137
+VMOVDQA 3138
+VMOVDQAYmr 3139
+VMOVDQAYrm 3140
+VMOVDQAYrr 3141
+VMOVDQAYrr_REV 3142
+VMOVDQAmr 3143
+VMOVDQArm 3144
+VMOVDQArr 3145
+VMOVDQArr_REV 3146
+VMOVDQU 3147
+VMOVDQUYmr 3148
+VMOVDQUYrm 3149
+VMOVDQUYrr 3150
+VMOVDQUYrr_REV 3151
+VMOVDQUmr 3152
+VMOVDQUrm 3153
+VMOVDQUrr 3154
+VMOVDQUrr_REV 3155
+VMOVHLPSZrr 3156
+VMOVHLPSrr 3157
+VMOVHPDZ 3158
+VMOVHPDmr 3159
+VMOVHPDrm 3160
+VMOVHPSZ 3161
+VMOVHPSmr 3162
+VMOVHPSrm 3163
+VMOVLHPSZrr 3164
+VMOVLHPSrr 3165
+VMOVLPDZ 3166
+VMOVLPDmr 3167
+VMOVLPDrm 3168
+VMOVLPSZ 3169
+VMOVLPSmr 3170
+VMOVLPSrm 3171
+VMOVMSKPDYrr 3172
+VMOVMSKPDrr 3173
+VMOVMSKPSYrr 3174
+VMOVMSKPSrr 3175
+VMOVNTDQAYrm 3176
+VMOVNTDQAZ 3177
+VMOVNTDQAZrm 3178
+VMOVNTDQArm 3179
+VMOVNTDQYmr 3180
+VMOVNTDQZ 3181
+VMOVNTDQZmr 3182
+VMOVNTDQmr 3183
+VMOVNTPDYmr 3184
+VMOVNTPDZ 3185
+VMOVNTPDZmr 3186
+VMOVNTPDmr 3187
+VMOVNTPSYmr 3188
+VMOVNTPSZ 3189
+VMOVNTPSZmr 3190
+VMOVNTPSmr 3191
+VMOVPDI 3192
+VMOVPQI 3193
+VMOVPQIto 3194
+VMOVQI 3195
+VMOVRSBZ 3196
+VMOVRSBZm 3197
+VMOVRSBZmk 3198
+VMOVRSBZmkz 3199
+VMOVRSDZ 3200
+VMOVRSDZm 3201
+VMOVRSDZmk 3202
+VMOVRSDZmkz 3203
+VMOVRSQZ 3204
+VMOVRSQZm 3205
+VMOVRSQZmk 3206
+VMOVRSQZmkz 3207
+VMOVRSWZ 3208
+VMOVRSWZm 3209
+VMOVRSWZmk 3210
+VMOVRSWZmkz 3211
+VMOVSDZmr 3212
+VMOVSDZmrk 3213
+VMOVSDZrm 3214
+VMOVSDZrm_alt 3215
+VMOVSDZrmk 3216
+VMOVSDZrmkz 3217
+VMOVSDZrr 3218
+VMOVSDZrr_REV 3219
+VMOVSDZrrk 3220
+VMOVSDZrrk_REV 3221
+VMOVSDZrrkz 3222
+VMOVSDZrrkz_REV 3223
+VMOVSDmr 3224
+VMOVSDrm 3225
+VMOVSDrm_alt 3226
+VMOVSDrr 3227
+VMOVSDrr_REV 3228
+VMOVSDto 3229
+VMOVSH 3230
+VMOVSHDUPYrm 3231
+VMOVSHDUPYrr 3232
+VMOVSHDUPZ 3233
+VMOVSHDUPZrm 3234
+VMOVSHDUPZrmk 3235
+VMOVSHDUPZrmkz 3236
+VMOVSHDUPZrr 3237
+VMOVSHDUPZrrk 3238
+VMOVSHDUPZrrkz 3239
+VMOVSHDUPrm 3240
+VMOVSHDUPrr 3241
+VMOVSHZmr 3242
+VMOVSHZmrk 3243
+VMOVSHZrm 3244
+VMOVSHZrm_alt 3245
+VMOVSHZrmk 3246
+VMOVSHZrmkz 3247
+VMOVSHZrr 3248
+VMOVSHZrr_REV 3249
+VMOVSHZrrk 3250
+VMOVSHZrrk_REV 3251
+VMOVSHZrrkz 3252
+VMOVSHZrrkz_REV 3253
+VMOVSHtoW 3254
+VMOVSLDUPYrm 3255
+VMOVSLDUPYrr 3256
+VMOVSLDUPZ 3257
+VMOVSLDUPZrm 3258
+VMOVSLDUPZrmk 3259
+VMOVSLDUPZrmkz 3260
+VMOVSLDUPZrr 3261
+VMOVSLDUPZrrk 3262
+VMOVSLDUPZrrkz 3263
+VMOVSLDUPrm 3264
+VMOVSLDUPrr 3265
+VMOVSS 3266
+VMOVSSZmr 3267
+VMOVSSZmrk 3268
+VMOVSSZrm 3269
+VMOVSSZrm_alt 3270
+VMOVSSZrmk 3271
+VMOVSSZrmkz 3272
+VMOVSSZrr 3273
+VMOVSSZrr_REV 3274
+VMOVSSZrrk 3275
+VMOVSSZrrk_REV 3276
+VMOVSSZrrkz 3277
+VMOVSSZrrkz_REV 3278
+VMOVSSmr 3279
+VMOVSSrm 3280
+VMOVSSrm_alt 3281
+VMOVSSrr 3282
+VMOVSSrr_REV 3283
+VMOVUPDYmr 3284
+VMOVUPDYrm 3285
+VMOVUPDYrr 3286
+VMOVUPDYrr_REV 3287
+VMOVUPDZ 3288
+VMOVUPDZmr 3289
+VMOVUPDZmrk 3290
+VMOVUPDZrm 3291
+VMOVUPDZrmk 3292
+VMOVUPDZrmkz 3293
+VMOVUPDZrr 3294
+VMOVUPDZrr_REV 3295
+VMOVUPDZrrk 3296
+VMOVUPDZrrk_REV 3297
+VMOVUPDZrrkz 3298
+VMOVUPDZrrkz_REV 3299
+VMOVUPDmr 3300
+VMOVUPDrm 3301
+VMOVUPDrr 3302
+VMOVUPDrr_REV 3303
+VMOVUPSYmr 3304
+VMOVUPSYrm 3305
+VMOVUPSYrr 3306
+VMOVUPSYrr_REV 3307
+VMOVUPSZ 3308
+VMOVUPSZmr 3309
+VMOVUPSZmrk 3310
+VMOVUPSZrm 3311
+VMOVUPSZrmk 3312
+VMOVUPSZrmkz 3313
+VMOVUPSZrr 3314
+VMOVUPSZrr_REV 3315
+VMOVUPSZrrk 3316
+VMOVUPSZrrk_REV 3317
+VMOVUPSZrrkz 3318
+VMOVUPSZrrkz_REV 3319
+VMOVUPSmr 3320
+VMOVUPSrm 3321
+VMOVUPSrr 3322
+VMOVUPSrr_REV 3323
+VMOVW 3324
+VMOVWmr 3325
+VMOVWrm 3326
+VMOVZPDILo 3327
+VMOVZPQILo 3328
+VMOVZPWILo 3329
+VMPSADBWYrmi 3330
+VMPSADBWYrri 3331
+VMPSADBWZ 3332
+VMPSADBWZrmi 3333
+VMPSADBWZrmik 3334
+VMPSADBWZrmikz 3335
+VMPSADBWZrri 3336
+VMPSADBWZrrik 3337
+VMPSADBWZrrikz 3338
+VMPSADBWrmi 3339
+VMPSADBWrri 3340
+VMPTRLDm 3341
+VMPTRSTm 3342
+VMREAD 3343
+VMRESUME 3344
+VMRUN 3345
+VMSAVE 3346
+VMULBF 3347
+VMULPDYrm 3348
+VMULPDYrr 3349
+VMULPDZ 3350
+VMULPDZrm 3351
+VMULPDZrmb 3352
+VMULPDZrmbk 3353
+VMULPDZrmbkz 3354
+VMULPDZrmk 3355
+VMULPDZrmkz 3356
+VMULPDZrr 3357
+VMULPDZrrb 3358
+VMULPDZrrbk 3359
+VMULPDZrrbkz 3360
+VMULPDZrrk 3361
+VMULPDZrrkz 3362
+VMULPDrm 3363
+VMULPDrr 3364
+VMULPHZ 3365
+VMULPHZrm 3366
+VMULPHZrmb 3367
+VMULPHZrmbk 3368
+VMULPHZrmbkz 3369
+VMULPHZrmk 3370
+VMULPHZrmkz 3371
+VMULPHZrr 3372
+VMULPHZrrb 3373
+VMULPHZrrbk 3374
+VMULPHZrrbkz 3375
+VMULPHZrrk 3376
+VMULPHZrrkz 3377
+VMULPSYrm 3378
+VMULPSYrr 3379
+VMULPSZ 3380
+VMULPSZrm 3381
+VMULPSZrmb 3382
+VMULPSZrmbk 3383
+VMULPSZrmbkz 3384
+VMULPSZrmk 3385
+VMULPSZrmkz 3386
+VMULPSZrr 3387
+VMULPSZrrb 3388
+VMULPSZrrbk 3389
+VMULPSZrrbkz 3390
+VMULPSZrrk 3391
+VMULPSZrrkz 3392
+VMULPSrm 3393
+VMULPSrr 3394
+VMULSDZrm 3395
+VMULSDZrm_Int 3396
+VMULSDZrmk_Int 3397
+VMULSDZrmkz_Int 3398
+VMULSDZrr 3399
+VMULSDZrr_Int 3400
+VMULSDZrrb_Int 3401
+VMULSDZrrbk_Int 3402
+VMULSDZrrbkz_Int 3403
+VMULSDZrrk_Int 3404
+VMULSDZrrkz_Int 3405
+VMULSDrm 3406
+VMULSDrm_Int 3407
+VMULSDrr 3408
+VMULSDrr_Int 3409
+VMULSHZrm 3410
+VMULSHZrm_Int 3411
+VMULSHZrmk_Int 3412
+VMULSHZrmkz_Int 3413
+VMULSHZrr 3414
+VMULSHZrr_Int 3415
+VMULSHZrrb_Int 3416
+VMULSHZrrbk_Int 3417
+VMULSHZrrbkz_Int 3418
+VMULSHZrrk_Int 3419
+VMULSHZrrkz_Int 3420
+VMULSSZrm 3421
+VMULSSZrm_Int 3422
+VMULSSZrmk_Int 3423
+VMULSSZrmkz_Int 3424
+VMULSSZrr 3425
+VMULSSZrr_Int 3426
+VMULSSZrrb_Int 3427
+VMULSSZrrbk_Int 3428
+VMULSSZrrbkz_Int 3429
+VMULSSZrrk_Int 3430
+VMULSSZrrkz_Int 3431
+VMULSSrm 3432
+VMULSSrm_Int 3433
+VMULSSrr 3434
+VMULSSrr_Int 3435
+VMWRITE 3436
+VMXOFF 3437
+VMXON 3438
+VORPDYrm 3439
+VORPDYrr 3440
+VORPDZ 3441
+VORPDZrm 3442
+VORPDZrmb 3443
+VORPDZrmbk 3444
+VORPDZrmbkz 3445
+VORPDZrmk 3446
+VORPDZrmkz 3447
+VORPDZrr 3448
+VORPDZrrk 3449
+VORPDZrrkz 3450
+VORPDrm 3451
+VORPDrr 3452
+VORPSYrm 3453
+VORPSYrr 3454
+VORPSZ 3455
+VORPSZrm 3456
+VORPSZrmb 3457
+VORPSZrmbk 3458
+VORPSZrmbkz 3459
+VORPSZrmk 3460
+VORPSZrmkz 3461
+VORPSZrr 3462
+VORPSZrrk 3463
+VORPSZrrkz 3464
+VORPSrm 3465
+VORPSrr 3466
+VP 3467
+VPABSBYrm 3468
+VPABSBYrr 3469
+VPABSBZ 3470
+VPABSBZrm 3471
+VPABSBZrmk 3472
+VPABSBZrmkz 3473
+VPABSBZrr 3474
+VPABSBZrrk 3475
+VPABSBZrrkz 3476
+VPABSBrm 3477
+VPABSBrr 3478
+VPABSDYrm 3479
+VPABSDYrr 3480
+VPABSDZ 3481
+VPABSDZrm 3482
+VPABSDZrmb 3483
+VPABSDZrmbk 3484
+VPABSDZrmbkz 3485
+VPABSDZrmk 3486
+VPABSDZrmkz 3487
+VPABSDZrr 3488
+VPABSDZrrk 3489
+VPABSDZrrkz 3490
+VPABSDrm 3491
+VPABSDrr 3492
+VPABSQZ 3493
+VPABSQZrm 3494
+VPABSQZrmb 3495
+VPABSQZrmbk 3496
+VPABSQZrmbkz 3497
+VPABSQZrmk 3498
+VPABSQZrmkz 3499
+VPABSQZrr 3500
+VPABSQZrrk 3501
+VPABSQZrrkz 3502
+VPABSWYrm 3503
+VPABSWYrr 3504
+VPABSWZ 3505
+VPABSWZrm 3506
+VPABSWZrmk 3507
+VPABSWZrmkz 3508
+VPABSWZrr 3509
+VPABSWZrrk 3510
+VPABSWZrrkz 3511
+VPABSWrm 3512
+VPABSWrr 3513
+VPACKSSDWYrm 3514
+VPACKSSDWYrr 3515
+VPACKSSDWZ 3516
+VPACKSSDWZrm 3517
+VPACKSSDWZrmb 3518
+VPACKSSDWZrmbk 3519
+VPACKSSDWZrmbkz 3520
+VPACKSSDWZrmk 3521
+VPACKSSDWZrmkz 3522
+VPACKSSDWZrr 3523
+VPACKSSDWZrrk 3524
+VPACKSSDWZrrkz 3525
+VPACKSSDWrm 3526
+VPACKSSDWrr 3527
+VPACKSSWBYrm 3528
+VPACKSSWBYrr 3529
+VPACKSSWBZ 3530
+VPACKSSWBZrm 3531
+VPACKSSWBZrmk 3532
+VPACKSSWBZrmkz 3533
+VPACKSSWBZrr 3534
+VPACKSSWBZrrk 3535
+VPACKSSWBZrrkz 3536
+VPACKSSWBrm 3537
+VPACKSSWBrr 3538
+VPACKUSDWYrm 3539
+VPACKUSDWYrr 3540
+VPACKUSDWZ 3541
+VPACKUSDWZrm 3542
+VPACKUSDWZrmb 3543
+VPACKUSDWZrmbk 3544
+VPACKUSDWZrmbkz 3545
+VPACKUSDWZrmk 3546
+VPACKUSDWZrmkz 3547
+VPACKUSDWZrr 3548
+VPACKUSDWZrrk 3549
+VPACKUSDWZrrkz 3550
+VPACKUSDWrm 3551
+VPACKUSDWrr 3552
+VPACKUSWBYrm 3553
+VPACKUSWBYrr 3554
+VPACKUSWBZ 3555
+VPACKUSWBZrm 3556
+VPACKUSWBZrmk 3557
+VPACKUSWBZrmkz 3558
+VPACKUSWBZrr 3559
+VPACKUSWBZrrk 3560
+VPACKUSWBZrrkz 3561
+VPACKUSWBrm 3562
+VPACKUSWBrr 3563
+VPADDBYrm 3564
+VPADDBYrr 3565
+VPADDBZ 3566
+VPADDBZrm 3567
+VPADDBZrmk 3568
+VPADDBZrmkz 3569
+VPADDBZrr 3570
+VPADDBZrrk 3571
+VPADDBZrrkz 3572
+VPADDBrm 3573
+VPADDBrr 3574
+VPADDDYrm 3575
+VPADDDYrr 3576
+VPADDDZ 3577
+VPADDDZrm 3578
+VPADDDZrmb 3579
+VPADDDZrmbk 3580
+VPADDDZrmbkz 3581
+VPADDDZrmk 3582
+VPADDDZrmkz 3583
+VPADDDZrr 3584
+VPADDDZrrk 3585
+VPADDDZrrkz 3586
+VPADDDrm 3587
+VPADDDrr 3588
+VPADDQYrm 3589
+VPADDQYrr 3590
+VPADDQZ 3591
+VPADDQZrm 3592
+VPADDQZrmb 3593
+VPADDQZrmbk 3594
+VPADDQZrmbkz 3595
+VPADDQZrmk 3596
+VPADDQZrmkz 3597
+VPADDQZrr 3598
+VPADDQZrrk 3599
+VPADDQZrrkz 3600
+VPADDQrm 3601
+VPADDQrr 3602
+VPADDSBYrm 3603
+VPADDSBYrr 3604
+VPADDSBZ 3605
+VPADDSBZrm 3606
+VPADDSBZrmk 3607
+VPADDSBZrmkz 3608
+VPADDSBZrr 3609
+VPADDSBZrrk 3610
+VPADDSBZrrkz 3611
+VPADDSBrm 3612
+VPADDSBrr 3613
+VPADDSWYrm 3614
+VPADDSWYrr 3615
+VPADDSWZ 3616
+VPADDSWZrm 3617
+VPADDSWZrmk 3618
+VPADDSWZrmkz 3619
+VPADDSWZrr 3620
+VPADDSWZrrk 3621
+VPADDSWZrrkz 3622
+VPADDSWrm 3623
+VPADDSWrr 3624
+VPADDUSBYrm 3625
+VPADDUSBYrr 3626
+VPADDUSBZ 3627
+VPADDUSBZrm 3628
+VPADDUSBZrmk 3629
+VPADDUSBZrmkz 3630
+VPADDUSBZrr 3631
+VPADDUSBZrrk 3632
+VPADDUSBZrrkz 3633
+VPADDUSBrm 3634
+VPADDUSBrr 3635
+VPADDUSWYrm 3636
+VPADDUSWYrr 3637
+VPADDUSWZ 3638
+VPADDUSWZrm 3639
+VPADDUSWZrmk 3640
+VPADDUSWZrmkz 3641
+VPADDUSWZrr 3642
+VPADDUSWZrrk 3643
+VPADDUSWZrrkz 3644
+VPADDUSWrm 3645
+VPADDUSWrr 3646
+VPADDWYrm 3647
+VPADDWYrr 3648
+VPADDWZ 3649
+VPADDWZrm 3650
+VPADDWZrmk 3651
+VPADDWZrmkz 3652
+VPADDWZrr 3653
+VPADDWZrrk 3654
+VPADDWZrrkz 3655
+VPADDWrm 3656
+VPADDWrr 3657
+VPALIGNRYrmi 3658
+VPALIGNRYrri 3659
+VPALIGNRZ 3660
+VPALIGNRZrmi 3661
+VPALIGNRZrmik 3662
+VPALIGNRZrmikz 3663
+VPALIGNRZrri 3664
+VPALIGNRZrrik 3665
+VPALIGNRZrrikz 3666
+VPALIGNRrmi 3667
+VPALIGNRrri 3668
+VPANDDZ 3669
+VPANDDZrm 3670
+VPANDDZrmb 3671
+VPANDDZrmbk 3672
+VPANDDZrmbkz 3673
+VPANDDZrmk 3674
+VPANDDZrmkz 3675
+VPANDDZrr 3676
+VPANDDZrrk 3677
+VPANDDZrrkz 3678
+VPANDNDZ 3679
+VPANDNDZrm 3680
+VPANDNDZrmb 3681
+VPANDNDZrmbk 3682
+VPANDNDZrmbkz 3683
+VPANDNDZrmk 3684
+VPANDNDZrmkz 3685
+VPANDNDZrr 3686
+VPANDNDZrrk 3687
+VPANDNDZrrkz 3688
+VPANDNQZ 3689
+VPANDNQZrm 3690
+VPANDNQZrmb 3691
+VPANDNQZrmbk 3692
+VPANDNQZrmbkz 3693
+VPANDNQZrmk 3694
+VPANDNQZrmkz 3695
+VPANDNQZrr 3696
+VPANDNQZrrk 3697
+VPANDNQZrrkz 3698
+VPANDNYrm 3699
+VPANDNYrr 3700
+VPANDNrm 3701
+VPANDNrr 3702
+VPANDQZ 3703
+VPANDQZrm 3704
+VPANDQZrmb 3705
+VPANDQZrmbk 3706
+VPANDQZrmbkz 3707
+VPANDQZrmk 3708
+VPANDQZrmkz 3709
+VPANDQZrr 3710
+VPANDQZrrk 3711
+VPANDQZrrkz 3712
+VPANDYrm 3713
+VPANDYrr 3714
+VPANDrm 3715
+VPANDrr 3716
+VPAVGBYrm 3717
+VPAVGBYrr 3718
+VPAVGBZ 3719
+VPAVGBZrm 3720
+VPAVGBZrmk 3721
+VPAVGBZrmkz 3722
+VPAVGBZrr 3723
+VPAVGBZrrk 3724
+VPAVGBZrrkz 3725
+VPAVGBrm 3726
+VPAVGBrr 3727
+VPAVGWYrm 3728
+VPAVGWYrr 3729
+VPAVGWZ 3730
+VPAVGWZrm 3731
+VPAVGWZrmk 3732
+VPAVGWZrmkz 3733
+VPAVGWZrr 3734
+VPAVGWZrrk 3735
+VPAVGWZrrkz 3736
+VPAVGWrm 3737
+VPAVGWrr 3738
+VPBLENDDYrmi 3739
+VPBLENDDYrri 3740
+VPBLENDDrmi 3741
+VPBLENDDrri 3742
+VPBLENDMBZ 3743
+VPBLENDMBZrm 3744
+VPBLENDMBZrmk 3745
+VPBLENDMBZrmkz 3746
+VPBLENDMBZrr 3747
+VPBLENDMBZrrk 3748
+VPBLENDMBZrrkz 3749
+VPBLENDMDZ 3750
+VPBLENDMDZrm 3751
+VPBLENDMDZrmb 3752
+VPBLENDMDZrmbk 3753
+VPBLENDMDZrmbkz 3754
+VPBLENDMDZrmk 3755
+VPBLENDMDZrmkz 3756
+VPBLENDMDZrr 3757
+VPBLENDMDZrrk 3758
+VPBLENDMDZrrkz 3759
+VPBLENDMQZ 3760
+VPBLENDMQZrm 3761
+VPBLENDMQZrmb 3762
+VPBLENDMQZrmbk 3763
+VPBLENDMQZrmbkz 3764
+VPBLENDMQZrmk 3765
+VPBLENDMQZrmkz 3766
+VPBLENDMQZrr 3767
+VPBLENDMQZrrk 3768
+VPBLENDMQZrrkz 3769
+VPBLENDMWZ 3770
+VPBLENDMWZrm 3771
+VPBLENDMWZrmk 3772
+VPBLENDMWZrmkz 3773
+VPBLENDMWZrr 3774
+VPBLENDMWZrrk 3775
+VPBLENDMWZrrkz 3776
+VPBLENDVBYrmr 3777
+VPBLENDVBYrrr 3778
+VPBLENDVBrmr 3779
+VPBLENDVBrrr 3780
+VPBLENDWYrmi 3781
+VPBLENDWYrri 3782
+VPBLENDWrmi 3783
+VPBLENDWrri 3784
+VPBROADCASTBYrm 3785
+VPBROADCASTBYrr 3786
+VPBROADCASTBZ 3787
+VPBROADCASTBZrm 3788
+VPBROADCASTBZrmk 3789
+VPBROADCASTBZrmkz 3790
+VPBROADCASTBZrr 3791
+VPBROADCASTBZrrk 3792
+VPBROADCASTBZrrkz 3793
+VPBROADCASTBrZ 3794
+VPBROADCASTBrZrr 3795
+VPBROADCASTBrZrrk 3796
+VPBROADCASTBrZrrkz 3797
+VPBROADCASTBrm 3798
+VPBROADCASTBrr 3799
+VPBROADCASTDYrm 3800
+VPBROADCASTDYrr 3801
+VPBROADCASTDZ 3802
+VPBROADCASTDZrm 3803
+VPBROADCASTDZrmk 3804
+VPBROADCASTDZrmkz 3805
+VPBROADCASTDZrr 3806
+VPBROADCASTDZrrk 3807
+VPBROADCASTDZrrkz 3808
+VPBROADCASTDrZ 3809
+VPBROADCASTDrZrr 3810
+VPBROADCASTDrZrrk 3811
+VPBROADCASTDrZrrkz 3812
+VPBROADCASTDrm 3813
+VPBROADCASTDrr 3814
+VPBROADCASTMB 3815
+VPBROADCASTMW 3816
+VPBROADCASTQYrm 3817
+VPBROADCASTQYrr 3818
+VPBROADCASTQZ 3819
+VPBROADCASTQZrm 3820
+VPBROADCASTQZrmk 3821
+VPBROADCASTQZrmkz 3822
+VPBROADCASTQZrr 3823
+VPBROADCASTQZrrk 3824
+VPBROADCASTQZrrkz 3825
+VPBROADCASTQrZ 3826
+VPBROADCASTQrZrr 3827
+VPBROADCASTQrZrrk 3828
+VPBROADCASTQrZrrkz 3829
+VPBROADCASTQrm 3830
+VPBROADCASTQrr 3831
+VPBROADCASTWYrm 3832
+VPBROADCASTWYrr 3833
+VPBROADCASTWZ 3834
+VPBROADCASTWZrm 3835
+VPBROADCASTWZrmk 3836
+VPBROADCASTWZrmkz 3837
+VPBROADCASTWZrr 3838
+VPBROADCASTWZrrk 3839
+VPBROADCASTWZrrkz 3840
+VPBROADCASTWrZ 3841
+VPBROADCASTWrZrr 3842
+VPBROADCASTWrZrrk 3843
+VPBROADCASTWrZrrkz 3844
+VPBROADCASTWrm 3845
+VPBROADCASTWrr 3846
+VPCLMULQDQYrmi 3847
+VPCLMULQDQYrri 3848
+VPCLMULQDQZ 3849
+VPCLMULQDQZrmi 3850
+VPCLMULQDQZrri 3851
+VPCLMULQDQrmi 3852
+VPCLMULQDQrri 3853
+VPCMOVYrmr 3854
+VPCMOVYrrm 3855
+VPCMOVYrrr 3856
+VPCMOVYrrr_REV 3857
+VPCMOVrmr 3858
+VPCMOVrrm 3859
+VPCMOVrrr 3860
+VPCMOVrrr_REV 3861
+VPCMPBZ 3862
+VPCMPBZrmi 3863
+VPCMPBZrmik 3864
+VPCMPBZrri 3865
+VPCMPBZrrik 3866
+VPCMPDZ 3867
+VPCMPDZrmbi 3868
+VPCMPDZrmbik 3869
+VPCMPDZrmi 3870
+VPCMPDZrmik 3871
+VPCMPDZrri 3872
+VPCMPDZrrik 3873
+VPCMPEQBYrm 3874
+VPCMPEQBYrr 3875
+VPCMPEQBZ 3876
+VPCMPEQBZrm 3877
+VPCMPEQBZrmk 3878
+VPCMPEQBZrr 3879
+VPCMPEQBZrrk 3880
+VPCMPEQBrm 3881
+VPCMPEQBrr 3882
+VPCMPEQDYrm 3883
+VPCMPEQDYrr 3884
+VPCMPEQDZ 3885
+VPCMPEQDZrm 3886
+VPCMPEQDZrmb 3887
+VPCMPEQDZrmbk 3888
+VPCMPEQDZrmk 3889
+VPCMPEQDZrr 3890
+VPCMPEQDZrrk 3891
+VPCMPEQDrm 3892
+VPCMPEQDrr 3893
+VPCMPEQQYrm 3894
+VPCMPEQQYrr 3895
+VPCMPEQQZ 3896
+VPCMPEQQZrm 3897
+VPCMPEQQZrmb 3898
+VPCMPEQQZrmbk 3899
+VPCMPEQQZrmk 3900
+VPCMPEQQZrr 3901
+VPCMPEQQZrrk 3902
+VPCMPEQQrm 3903
+VPCMPEQQrr 3904
+VPCMPEQWYrm 3905
+VPCMPEQWYrr 3906
+VPCMPEQWZ 3907
+VPCMPEQWZrm 3908
+VPCMPEQWZrmk 3909
+VPCMPEQWZrr 3910
+VPCMPEQWZrrk 3911
+VPCMPEQWrm 3912
+VPCMPEQWrr 3913
+VPCMPESTRIrmi 3914
+VPCMPESTRIrri 3915
+VPCMPESTRMrmi 3916
+VPCMPESTRMrri 3917
+VPCMPGTBYrm 3918
+VPCMPGTBYrr 3919
+VPCMPGTBZ 3920
+VPCMPGTBZrm 3921
+VPCMPGTBZrmk 3922
+VPCMPGTBZrr 3923
+VPCMPGTBZrrk 3924
+VPCMPGTBrm 3925
+VPCMPGTBrr 3926
+VPCMPGTDYrm 3927
+VPCMPGTDYrr 3928
+VPCMPGTDZ 3929
+VPCMPGTDZrm 3930
+VPCMPGTDZrmb 3931
+VPCMPGTDZrmbk 3932
+VPCMPGTDZrmk 3933
+VPCMPGTDZrr 3934
+VPCMPGTDZrrk 3935
+VPCMPGTDrm 3936
+VPCMPGTDrr 3937
+VPCMPGTQYrm 3938
+VPCMPGTQYrr 3939
+VPCMPGTQZ 3940
+VPCMPGTQZrm 3941
+VPCMPGTQZrmb 3942
+VPCMPGTQZrmbk 3943
+VPCMPGTQZrmk 3944
+VPCMPGTQZrr 3945
+VPCMPGTQZrrk 3946
+VPCMPGTQrm 3947
+VPCMPGTQrr 3948
+VPCMPGTWYrm 3949
+VPCMPGTWYrr 3950
+VPCMPGTWZ 3951
+VPCMPGTWZrm 3952
+VPCMPGTWZrmk 3953
+VPCMPGTWZrr 3954
+VPCMPGTWZrrk 3955
+VPCMPGTWrm 3956
+VPCMPGTWrr 3957
+VPCMPISTRIrmi 3958
+VPCMPISTRIrri 3959
+VPCMPISTRMrmi 3960
+VPCMPISTRMrri 3961
+VPCMPQZ 3962
+VPCMPQZrmbi 3963
+VPCMPQZrmbik 3964
+VPCMPQZrmi 3965
+VPCMPQZrmik 3966
+VPCMPQZrri 3967
+VPCMPQZrrik 3968
+VPCMPUBZ 3969
+VPCMPUBZrmi 3970
+VPCMPUBZrmik 3971
+VPCMPUBZrri 3972
+VPCMPUBZrrik 3973
+VPCMPUDZ 3974
+VPCMPUDZrmbi 3975
+VPCMPUDZrmbik 3976
+VPCMPUDZrmi 3977
+VPCMPUDZrmik 3978
+VPCMPUDZrri 3979
+VPCMPUDZrrik 3980
+VPCMPUQZ 3981
+VPCMPUQZrmbi 3982
+VPCMPUQZrmbik 3983
+VPCMPUQZrmi 3984
+VPCMPUQZrmik 3985
+VPCMPUQZrri 3986
+VPCMPUQZrrik 3987
+VPCMPUWZ 3988
+VPCMPUWZrmi 3989
+VPCMPUWZrmik 3990
+VPCMPUWZrri 3991
+VPCMPUWZrrik 3992
+VPCMPWZ 3993
+VPCMPWZrmi 3994
+VPCMPWZrmik 3995
+VPCMPWZrri 3996
+VPCMPWZrrik 3997
+VPCOMBmi 3998
+VPCOMBri 3999
+VPCOMDmi 4000
+VPCOMDri 4001
+VPCOMPRESSBZ 4002
+VPCOMPRESSBZmr 4003
+VPCOMPRESSBZmrk 4004
+VPCOMPRESSBZrr 4005
+VPCOMPRESSBZrrk 4006
+VPCOMPRESSBZrrkz 4007
+VPCOMPRESSDZ 4008
+VPCOMPRESSDZmr 4009
+VPCOMPRESSDZmrk 4010
+VPCOMPRESSDZrr 4011
+VPCOMPRESSDZrrk 4012
+VPCOMPRESSDZrrkz 4013
+VPCOMPRESSQZ 4014
+VPCOMPRESSQZmr 4015
+VPCOMPRESSQZmrk 4016
+VPCOMPRESSQZrr 4017
+VPCOMPRESSQZrrk 4018
+VPCOMPRESSQZrrkz 4019
+VPCOMPRESSWZ 4020
+VPCOMPRESSWZmr 4021
+VPCOMPRESSWZmrk 4022
+VPCOMPRESSWZrr 4023
+VPCOMPRESSWZrrk 4024
+VPCOMPRESSWZrrkz 4025
+VPCOMQmi 4026
+VPCOMQri 4027
+VPCOMUBmi 4028
+VPCOMUBri 4029
+VPCOMUDmi 4030
+VPCOMUDri 4031
+VPCOMUQmi 4032
+VPCOMUQri 4033
+VPCOMUWmi 4034
+VPCOMUWri 4035
+VPCOMWmi 4036
+VPCOMWri 4037
+VPCONFLICTDZ 4038
+VPCONFLICTDZrm 4039
+VPCONFLICTDZrmb 4040
+VPCONFLICTDZrmbk 4041
+VPCONFLICTDZrmbkz 4042
+VPCONFLICTDZrmk 4043
+VPCONFLICTDZrmkz 4044
+VPCONFLICTDZrr 4045
+VPCONFLICTDZrrk 4046
+VPCONFLICTDZrrkz 4047
+VPCONFLICTQZ 4048
+VPCONFLICTQZrm 4049
+VPCONFLICTQZrmb 4050
+VPCONFLICTQZrmbk 4051
+VPCONFLICTQZrmbkz 4052
+VPCONFLICTQZrmk 4053
+VPCONFLICTQZrmkz 4054
+VPCONFLICTQZrr 4055
+VPCONFLICTQZrrk 4056
+VPCONFLICTQZrrkz 4057
+VPDPBSSDSYrm 4058
+VPDPBSSDSYrr 4059
+VPDPBSSDSZ 4060
+VPDPBSSDSZrm 4061
+VPDPBSSDSZrmb 4062
+VPDPBSSDSZrmbk 4063
+VPDPBSSDSZrmbkz 4064
+VPDPBSSDSZrmk 4065
+VPDPBSSDSZrmkz 4066
+VPDPBSSDSZrr 4067
+VPDPBSSDSZrrk 4068
+VPDPBSSDSZrrkz 4069
+VPDPBSSDSrm 4070
+VPDPBSSDSrr 4071
+VPDPBSSDYrm 4072
+VPDPBSSDYrr 4073
+VPDPBSSDZ 4074
+VPDPBSSDZrm 4075
+VPDPBSSDZrmb 4076
+VPDPBSSDZrmbk 4077
+VPDPBSSDZrmbkz 4078
+VPDPBSSDZrmk 4079
+VPDPBSSDZrmkz 4080
+VPDPBSSDZrr 4081
+VPDPBSSDZrrk 4082
+VPDPBSSDZrrkz 4083
+VPDPBSSDrm 4084
+VPDPBSSDrr 4085
+VPDPBSUDSYrm 4086
+VPDPBSUDSYrr 4087
+VPDPBSUDSZ 4088
+VPDPBSUDSZrm 4089
+VPDPBSUDSZrmb 4090
+VPDPBSUDSZrmbk 4091
+VPDPBSUDSZrmbkz 4092
+VPDPBSUDSZrmk 4093
+VPDPBSUDSZrmkz 4094
+VPDPBSUDSZrr 4095
+VPDPBSUDSZrrk 4096
+VPDPBSUDSZrrkz 4097
+VPDPBSUDSrm 4098
+VPDPBSUDSrr 4099
+VPDPBSUDYrm 4100
+VPDPBSUDYrr 4101
+VPDPBSUDZ 4102
+VPDPBSUDZrm 4103
+VPDPBSUDZrmb 4104
+VPDPBSUDZrmbk 4105
+VPDPBSUDZrmbkz 4106
+VPDPBSUDZrmk 4107
+VPDPBSUDZrmkz 4108
+VPDPBSUDZrr 4109
+VPDPBSUDZrrk 4110
+VPDPBSUDZrrkz 4111
+VPDPBSUDrm 4112
+VPDPBSUDrr 4113
+VPDPBUSDSYrm 4114
+VPDPBUSDSYrr 4115
+VPDPBUSDSZ 4116
+VPDPBUSDSZrm 4117
+VPDPBUSDSZrmb 4118
+VPDPBUSDSZrmbk 4119
+VPDPBUSDSZrmbkz 4120
+VPDPBUSDSZrmk 4121
+VPDPBUSDSZrmkz 4122
+VPDPBUSDSZrr 4123
+VPDPBUSDSZrrk 4124
+VPDPBUSDSZrrkz 4125
+VPDPBUSDSrm 4126
+VPDPBUSDSrr 4127
+VPDPBUSDYrm 4128
+VPDPBUSDYrr 4129
+VPDPBUSDZ 4130
+VPDPBUSDZrm 4131
+VPDPBUSDZrmb 4132
+VPDPBUSDZrmbk 4133
+VPDPBUSDZrmbkz 4134
+VPDPBUSDZrmk 4135
+VPDPBUSDZrmkz 4136
+VPDPBUSDZrr 4137
+VPDPBUSDZrrk 4138
+VPDPBUSDZrrkz 4139
+VPDPBUSDrm 4140
+VPDPBUSDrr 4141
+VPDPBUUDSYrm 4142
+VPDPBUUDSYrr 4143
+VPDPBUUDSZ 4144
+VPDPBUUDSZrm 4145
+VPDPBUUDSZrmb 4146
+VPDPBUUDSZrmbk 4147
+VPDPBUUDSZrmbkz 4148
+VPDPBUUDSZrmk 4149
+VPDPBUUDSZrmkz 4150
+VPDPBUUDSZrr 4151
+VPDPBUUDSZrrk 4152
+VPDPBUUDSZrrkz 4153
+VPDPBUUDSrm 4154
+VPDPBUUDSrr 4155
+VPDPBUUDYrm 4156
+VPDPBUUDYrr 4157
+VPDPBUUDZ 4158
+VPDPBUUDZrm 4159
+VPDPBUUDZrmb 4160
+VPDPBUUDZrmbk 4161
+VPDPBUUDZrmbkz 4162
+VPDPBUUDZrmk 4163
+VPDPBUUDZrmkz 4164
+VPDPBUUDZrr 4165
+VPDPBUUDZrrk 4166
+VPDPBUUDZrrkz 4167
+VPDPBUUDrm 4168
+VPDPBUUDrr 4169
+VPDPWSSDSYrm 4170
+VPDPWSSDSYrr 4171
+VPDPWSSDSZ 4172
+VPDPWSSDSZrm 4173
+VPDPWSSDSZrmb 4174
+VPDPWSSDSZrmbk 4175
+VPDPWSSDSZrmbkz 4176
+VPDPWSSDSZrmk 4177
+VPDPWSSDSZrmkz 4178
+VPDPWSSDSZrr 4179
+VPDPWSSDSZrrk 4180
+VPDPWSSDSZrrkz 4181
+VPDPWSSDSrm 4182
+VPDPWSSDSrr 4183
+VPDPWSSDYrm 4184
+VPDPWSSDYrr 4185
+VPDPWSSDZ 4186
+VPDPWSSDZrm 4187
+VPDPWSSDZrmb 4188
+VPDPWSSDZrmbk 4189
+VPDPWSSDZrmbkz 4190
+VPDPWSSDZrmk 4191
+VPDPWSSDZrmkz 4192
+VPDPWSSDZrr 4193
+VPDPWSSDZrrk 4194
+VPDPWSSDZrrkz 4195
+VPDPWSSDrm 4196
+VPDPWSSDrr 4197
+VPDPWSUDSYrm 4198
+VPDPWSUDSYrr 4199
+VPDPWSUDSZ 4200
+VPDPWSUDSZrm 4201
+VPDPWSUDSZrmb 4202
+VPDPWSUDSZrmbk 4203
+VPDPWSUDSZrmbkz 4204
+VPDPWSUDSZrmk 4205
+VPDPWSUDSZrmkz 4206
+VPDPWSUDSZrr 4207
+VPDPWSUDSZrrk 4208
+VPDPWSUDSZrrkz 4209
+VPDPWSUDSrm 4210
+VPDPWSUDSrr 4211
+VPDPWSUDYrm 4212
+VPDPWSUDYrr 4213
+VPDPWSUDZ 4214
+VPDPWSUDZrm 4215
+VPDPWSUDZrmb 4216
+VPDPWSUDZrmbk 4217
+VPDPWSUDZrmbkz 4218
+VPDPWSUDZrmk 4219
+VPDPWSUDZrmkz 4220
+VPDPWSUDZrr 4221
+VPDPWSUDZrrk 4222
+VPDPWSUDZrrkz 4223
+VPDPWSUDrm 4224
+VPDPWSUDrr 4225
+VPDPWUSDSYrm 4226
+VPDPWUSDSYrr 4227
+VPDPWUSDSZ 4228
+VPDPWUSDSZrm 4229
+VPDPWUSDSZrmb 4230
+VPDPWUSDSZrmbk 4231
+VPDPWUSDSZrmbkz 4232
+VPDPWUSDSZrmk 4233
+VPDPWUSDSZrmkz 4234
+VPDPWUSDSZrr 4235
+VPDPWUSDSZrrk 4236
+VPDPWUSDSZrrkz 4237
+VPDPWUSDSrm 4238
+VPDPWUSDSrr 4239
+VPDPWUSDYrm 4240
+VPDPWUSDYrr 4241
+VPDPWUSDZ 4242
+VPDPWUSDZrm 4243
+VPDPWUSDZrmb 4244
+VPDPWUSDZrmbk 4245
+VPDPWUSDZrmbkz 4246
+VPDPWUSDZrmk 4247
+VPDPWUSDZrmkz 4248
+VPDPWUSDZrr 4249
+VPDPWUSDZrrk 4250
+VPDPWUSDZrrkz 4251
+VPDPWUSDrm 4252
+VPDPWUSDrr 4253
+VPDPWUUDSYrm 4254
+VPDPWUUDSYrr 4255
+VPDPWUUDSZ 4256
+VPDPWUUDSZrm 4257
+VPDPWUUDSZrmb 4258
+VPDPWUUDSZrmbk 4259
+VPDPWUUDSZrmbkz 4260
+VPDPWUUDSZrmk 4261
+VPDPWUUDSZrmkz 4262
+VPDPWUUDSZrr 4263
+VPDPWUUDSZrrk 4264
+VPDPWUUDSZrrkz 4265
+VPDPWUUDSrm 4266
+VPDPWUUDSrr 4267
+VPDPWUUDYrm 4268
+VPDPWUUDYrr 4269
+VPDPWUUDZ 4270
+VPDPWUUDZrm 4271
+VPDPWUUDZrmb 4272
+VPDPWUUDZrmbk 4273
+VPDPWUUDZrmbkz 4274
+VPDPWUUDZrmk 4275
+VPDPWUUDZrmkz 4276
+VPDPWUUDZrr 4277
+VPDPWUUDZrrk 4278
+VPDPWUUDZrrkz 4279
+VPDPWUUDrm 4280
+VPDPWUUDrr 4281
+VPERM 4282
+VPERMBZ 4283
+VPERMBZrm 4284
+VPERMBZrmk 4285
+VPERMBZrmkz 4286
+VPERMBZrr 4287
+VPERMBZrrk 4288
+VPERMBZrrkz 4289
+VPERMDYrm 4290
+VPERMDYrr 4291
+VPERMDZ 4292
+VPERMDZrm 4293
+VPERMDZrmb 4294
+VPERMDZrmbk 4295
+VPERMDZrmbkz 4296
+VPERMDZrmk 4297
+VPERMDZrmkz 4298
+VPERMDZrr 4299
+VPERMDZrrk 4300
+VPERMDZrrkz 4301
+VPERMI 4302
+VPERMIL 4303
+VPERMILPDYmi 4304
+VPERMILPDYri 4305
+VPERMILPDYrm 4306
+VPERMILPDYrr 4307
+VPERMILPDZ 4308
+VPERMILPDZmbi 4309
+VPERMILPDZmbik 4310
+VPERMILPDZmbikz 4311
+VPERMILPDZmi 4312
+VPERMILPDZmik 4313
+VPERMILPDZmikz 4314
+VPERMILPDZri 4315
+VPERMILPDZrik 4316
+VPERMILPDZrikz 4317
+VPERMILPDZrm 4318
+VPERMILPDZrmb 4319
+VPERMILPDZrmbk 4320
+VPERMILPDZrmbkz 4321
+VPERMILPDZrmk 4322
+VPERMILPDZrmkz 4323
+VPERMILPDZrr 4324
+VPERMILPDZrrk 4325
+VPERMILPDZrrkz 4326
+VPERMILPDmi 4327
+VPERMILPDri 4328
+VPERMILPDrm 4329
+VPERMILPDrr 4330
+VPERMILPSYmi 4331
+VPERMILPSYri 4332
+VPERMILPSYrm 4333
+VPERMILPSYrr 4334
+VPERMILPSZ 4335
+VPERMILPSZmbi 4336
+VPERMILPSZmbik 4337
+VPERMILPSZmbikz 4338
+VPERMILPSZmi 4339
+VPERMILPSZmik 4340
+VPERMILPSZmikz 4341
+VPERMILPSZri 4342
+VPERMILPSZrik 4343
+VPERMILPSZrikz 4344
+VPERMILPSZrm 4345
+VPERMILPSZrmb 4346
+VPERMILPSZrmbk 4347
+VPERMILPSZrmbkz 4348
+VPERMILPSZrmk 4349
+VPERMILPSZrmkz 4350
+VPERMILPSZrr 4351
+VPERMILPSZrrk 4352
+VPERMILPSZrrkz 4353
+VPERMILPSmi 4354
+VPERMILPSri 4355
+VPERMILPSrm 4356
+VPERMILPSrr 4357
+VPERMPDYmi 4358
+VPERMPDYri 4359
+VPERMPDZ 4360
+VPERMPDZmbi 4361
+VPERMPDZmbik 4362
+VPERMPDZmbikz 4363
+VPERMPDZmi 4364
+VPERMPDZmik 4365
+VPERMPDZmikz 4366
+VPERMPDZri 4367
+VPERMPDZrik 4368
+VPERMPDZrikz 4369
+VPERMPDZrm 4370
+VPERMPDZrmb 4371
+VPERMPDZrmbk 4372
+VPERMPDZrmbkz 4373
+VPERMPDZrmk 4374
+VPERMPDZrmkz 4375
+VPERMPDZrr 4376
+VPERMPDZrrk 4377
+VPERMPDZrrkz 4378
+VPERMPSYrm 4379
+VPERMPSYrr 4380
+VPERMPSZ 4381
+VPERMPSZrm 4382
+VPERMPSZrmb 4383
+VPERMPSZrmbk 4384
+VPERMPSZrmbkz 4385
+VPERMPSZrmk 4386
+VPERMPSZrmkz 4387
+VPERMPSZrr 4388
+VPERMPSZrrk 4389
+VPERMPSZrrkz 4390
+VPERMQYmi 4391
+VPERMQYri 4392
+VPERMQZ 4393
+VPERMQZmbi 4394
+VPERMQZmbik 4395
+VPERMQZmbikz 4396
+VPERMQZmi 4397
+VPERMQZmik 4398
+VPERMQZmikz 4399
+VPERMQZri 4400
+VPERMQZrik 4401
+VPERMQZrikz 4402
+VPERMQZrm 4403
+VPERMQZrmb 4404
+VPERMQZrmbk 4405
+VPERMQZrmbkz 4406
+VPERMQZrmk 4407
+VPERMQZrmkz 4408
+VPERMQZrr 4409
+VPERMQZrrk 4410
+VPERMQZrrkz 4411
+VPERMT 4412
+VPERMWZ 4413
+VPERMWZrm 4414
+VPERMWZrmk 4415
+VPERMWZrmkz 4416
+VPERMWZrr 4417
+VPERMWZrrk 4418
+VPERMWZrrkz 4419
+VPEXPANDBZ 4420
+VPEXPANDBZrm 4421
+VPEXPANDBZrmk 4422
+VPEXPANDBZrmkz 4423
+VPEXPANDBZrr 4424
+VPEXPANDBZrrk 4425
+VPEXPANDBZrrkz 4426
+VPEXPANDDZ 4427
+VPEXPANDDZrm 4428
+VPEXPANDDZrmk 4429
+VPEXPANDDZrmkz 4430
+VPEXPANDDZrr 4431
+VPEXPANDDZrrk 4432
+VPEXPANDDZrrkz 4433
+VPEXPANDQZ 4434
+VPEXPANDQZrm 4435
+VPEXPANDQZrmk 4436
+VPEXPANDQZrmkz 4437
+VPEXPANDQZrr 4438
+VPEXPANDQZrrk 4439
+VPEXPANDQZrrkz 4440
+VPEXPANDWZ 4441
+VPEXPANDWZrm 4442
+VPEXPANDWZrmk 4443
+VPEXPANDWZrmkz 4444
+VPEXPANDWZrr 4445
+VPEXPANDWZrrk 4446
+VPEXPANDWZrrkz 4447
+VPEXTRBZmri 4448
+VPEXTRBZrri 4449
+VPEXTRBmri 4450
+VPEXTRBrri 4451
+VPEXTRDZmri 4452
+VPEXTRDZrri 4453
+VPEXTRDmri 4454
+VPEXTRDrri 4455
+VPEXTRQZmri 4456
+VPEXTRQZrri 4457
+VPEXTRQmri 4458
+VPEXTRQrri 4459
+VPEXTRWZmri 4460
+VPEXTRWZrri 4461
+VPEXTRWZrri_REV 4462
+VPEXTRWmri 4463
+VPEXTRWrri 4464
+VPEXTRWrri_REV 4465
+VPGATHERDDYrm 4466
+VPGATHERDDZ 4467
+VPGATHERDDZrm 4468
+VPGATHERDDrm 4469
+VPGATHERDQYrm 4470
+VPGATHERDQZ 4471
+VPGATHERDQZrm 4472
+VPGATHERDQrm 4473
+VPGATHERQDYrm 4474
+VPGATHERQDZ 4475
+VPGATHERQDZrm 4476
+VPGATHERQDrm 4477
+VPGATHERQQYrm 4478
+VPGATHERQQZ 4479
+VPGATHERQQZrm 4480
+VPGATHERQQrm 4481
+VPHADDBDrm 4482
+VPHADDBDrr 4483
+VPHADDBQrm 4484
+VPHADDBQrr 4485
+VPHADDBWrm 4486
+VPHADDBWrr 4487
+VPHADDDQrm 4488
+VPHADDDQrr 4489
+VPHADDDYrm 4490
+VPHADDDYrr 4491
+VPHADDDrm 4492
+VPHADDDrr 4493
+VPHADDSWYrm 4494
+VPHADDSWYrr 4495
+VPHADDSWrm 4496
+VPHADDSWrr 4497
+VPHADDUBDrm 4498
+VPHADDUBDrr 4499
+VPHADDUBQrm 4500
+VPHADDUBQrr 4501
+VPHADDUBWrm 4502
+VPHADDUBWrr 4503
+VPHADDUDQrm 4504
+VPHADDUDQrr 4505
+VPHADDUWDrm 4506
+VPHADDUWDrr 4507
+VPHADDUWQrm 4508
+VPHADDUWQrr 4509
+VPHADDWDrm 4510
+VPHADDWDrr 4511
+VPHADDWQrm 4512
+VPHADDWQrr 4513
+VPHADDWYrm 4514
+VPHADDWYrr 4515
+VPHADDWrm 4516
+VPHADDWrr 4517
+VPHMINPOSUWrm 4518
+VPHMINPOSUWrr 4519
+VPHSUBBWrm 4520
+VPHSUBBWrr 4521
+VPHSUBDQrm 4522
+VPHSUBDQrr 4523
+VPHSUBDYrm 4524
+VPHSUBDYrr 4525
+VPHSUBDrm 4526
+VPHSUBDrr 4527
+VPHSUBSWYrm 4528
+VPHSUBSWYrr 4529
+VPHSUBSWrm 4530
+VPHSUBSWrr 4531
+VPHSUBWDrm 4532
+VPHSUBWDrr 4533
+VPHSUBWYrm 4534
+VPHSUBWYrr 4535
+VPHSUBWrm 4536
+VPHSUBWrr 4537
+VPINSRBZrmi 4538
+VPINSRBZrri 4539
+VPINSRBrmi 4540
+VPINSRBrri 4541
+VPINSRDZrmi 4542
+VPINSRDZrri 4543
+VPINSRDrmi 4544
+VPINSRDrri 4545
+VPINSRQZrmi 4546
+VPINSRQZrri 4547
+VPINSRQrmi 4548
+VPINSRQrri 4549
+VPINSRWZrmi 4550
+VPINSRWZrri 4551
+VPINSRWrmi 4552
+VPINSRWrri 4553
+VPLZCNTDZ 4554
+VPLZCNTDZrm 4555
+VPLZCNTDZrmb 4556
+VPLZCNTDZrmbk 4557
+VPLZCNTDZrmbkz 4558
+VPLZCNTDZrmk 4559
+VPLZCNTDZrmkz 4560
+VPLZCNTDZrr 4561
+VPLZCNTDZrrk 4562
+VPLZCNTDZrrkz 4563
+VPLZCNTQZ 4564
+VPLZCNTQZrm 4565
+VPLZCNTQZrmb 4566
+VPLZCNTQZrmbk 4567
+VPLZCNTQZrmbkz 4568
+VPLZCNTQZrmk 4569
+VPLZCNTQZrmkz 4570
+VPLZCNTQZrr 4571
+VPLZCNTQZrrk 4572
+VPLZCNTQZrrkz 4573
+VPMACSDDrm 4574
+VPMACSDDrr 4575
+VPMACSDQHrm 4576
+VPMACSDQHrr 4577
+VPMACSDQLrm 4578
+VPMACSDQLrr 4579
+VPMACSSDDrm 4580
+VPMACSSDDrr 4581
+VPMACSSDQHrm 4582
+VPMACSSDQHrr 4583
+VPMACSSDQLrm 4584
+VPMACSSDQLrr 4585
+VPMACSSWDrm 4586
+VPMACSSWDrr 4587
+VPMACSSWWrm 4588
+VPMACSSWWrr 4589
+VPMACSWDrm 4590
+VPMACSWDrr 4591
+VPMACSWWrm 4592
+VPMACSWWrr 4593
+VPMADCSSWDrm 4594
+VPMADCSSWDrr 4595
+VPMADCSWDrm 4596
+VPMADCSWDrr 4597
+VPMADD 4598
+VPMADDUBSWYrm 4599
+VPMADDUBSWYrr 4600
+VPMADDUBSWZ 4601
+VPMADDUBSWZrm 4602
+VPMADDUBSWZrmk 4603
+VPMADDUBSWZrmkz 4604
+VPMADDUBSWZrr 4605
+VPMADDUBSWZrrk 4606
+VPMADDUBSWZrrkz 4607
+VPMADDUBSWrm 4608
+VPMADDUBSWrr 4609
+VPMADDWDYrm 4610
+VPMADDWDYrr 4611
+VPMADDWDZ 4612
+VPMADDWDZrm 4613
+VPMADDWDZrmk 4614
+VPMADDWDZrmkz 4615
+VPMADDWDZrr 4616
+VPMADDWDZrrk 4617
+VPMADDWDZrrkz 4618
+VPMADDWDrm 4619
+VPMADDWDrr 4620
+VPMASKMOVDYmr 4621
+VPMASKMOVDYrm 4622
+VPMASKMOVDmr 4623
+VPMASKMOVDrm 4624
+VPMASKMOVQYmr 4625
+VPMASKMOVQYrm 4626
+VPMASKMOVQmr 4627
+VPMASKMOVQrm 4628
+VPMAXSBYrm 4629
+VPMAXSBYrr 4630
+VPMAXSBZ 4631
+VPMAXSBZrm 4632
+VPMAXSBZrmk 4633
+VPMAXSBZrmkz 4634
+VPMAXSBZrr 4635
+VPMAXSBZrrk 4636
+VPMAXSBZrrkz 4637
+VPMAXSBrm 4638
+VPMAXSBrr 4639
+VPMAXSDYrm 4640
+VPMAXSDYrr 4641
+VPMAXSDZ 4642
+VPMAXSDZrm 4643
+VPMAXSDZrmb 4644
+VPMAXSDZrmbk 4645
+VPMAXSDZrmbkz 4646
+VPMAXSDZrmk 4647
+VPMAXSDZrmkz 4648
+VPMAXSDZrr 4649
+VPMAXSDZrrk 4650
+VPMAXSDZrrkz 4651
+VPMAXSDrm 4652
+VPMAXSDrr 4653
+VPMAXSQZ 4654
+VPMAXSQZrm 4655
+VPMAXSQZrmb 4656
+VPMAXSQZrmbk 4657
+VPMAXSQZrmbkz 4658
+VPMAXSQZrmk 4659
+VPMAXSQZrmkz 4660
+VPMAXSQZrr 4661
+VPMAXSQZrrk 4662
+VPMAXSQZrrkz 4663
+VPMAXSWYrm 4664
+VPMAXSWYrr 4665
+VPMAXSWZ 4666
+VPMAXSWZrm 4667
+VPMAXSWZrmk 4668
+VPMAXSWZrmkz 4669
+VPMAXSWZrr 4670
+VPMAXSWZrrk 4671
+VPMAXSWZrrkz 4672
+VPMAXSWrm 4673
+VPMAXSWrr 4674
+VPMAXUBYrm 4675
+VPMAXUBYrr 4676
+VPMAXUBZ 4677
+VPMAXUBZrm 4678
+VPMAXUBZrmk 4679
+VPMAXUBZrmkz 4680
+VPMAXUBZrr 4681
+VPMAXUBZrrk 4682
+VPMAXUBZrrkz 4683
+VPMAXUBrm 4684
+VPMAXUBrr 4685
+VPMAXUDYrm 4686
+VPMAXUDYrr 4687
+VPMAXUDZ 4688
+VPMAXUDZrm 4689
+VPMAXUDZrmb 4690
+VPMAXUDZrmbk 4691
+VPMAXUDZrmbkz 4692
+VPMAXUDZrmk 4693
+VPMAXUDZrmkz 4694
+VPMAXUDZrr 4695
+VPMAXUDZrrk 4696
+VPMAXUDZrrkz 4697
+VPMAXUDrm 4698
+VPMAXUDrr 4699
+VPMAXUQZ 4700
+VPMAXUQZrm 4701
+VPMAXUQZrmb 4702
+VPMAXUQZrmbk 4703
+VPMAXUQZrmbkz 4704
+VPMAXUQZrmk 4705
+VPMAXUQZrmkz 4706
+VPMAXUQZrr 4707
+VPMAXUQZrrk 4708
+VPMAXUQZrrkz 4709
+VPMAXUWYrm 4710
+VPMAXUWYrr 4711
+VPMAXUWZ 4712
+VPMAXUWZrm 4713
+VPMAXUWZrmk 4714
+VPMAXUWZrmkz 4715
+VPMAXUWZrr 4716
+VPMAXUWZrrk 4717
+VPMAXUWZrrkz 4718
+VPMAXUWrm 4719
+VPMAXUWrr 4720
+VPMINSBYrm 4721
+VPMINSBYrr 4722
+VPMINSBZ 4723
+VPMINSBZrm 4724
+VPMINSBZrmk 4725
+VPMINSBZrmkz 4726
+VPMINSBZrr 4727
+VPMINSBZrrk 4728
+VPMINSBZrrkz 4729
+VPMINSBrm 4730
+VPMINSBrr 4731
+VPMINSDYrm 4732
+VPMINSDYrr 4733
+VPMINSDZ 4734
+VPMINSDZrm 4735
+VPMINSDZrmb 4736
+VPMINSDZrmbk 4737
+VPMINSDZrmbkz 4738
+VPMINSDZrmk 4739
+VPMINSDZrmkz 4740
+VPMINSDZrr 4741
+VPMINSDZrrk 4742
+VPMINSDZrrkz 4743
+VPMINSDrm 4744
+VPMINSDrr 4745
+VPMINSQZ 4746
+VPMINSQZrm 4747
+VPMINSQZrmb 4748
+VPMINSQZrmbk 4749
+VPMINSQZrmbkz 4750
+VPMINSQZrmk 4751
+VPMINSQZrmkz 4752
+VPMINSQZrr 4753
+VPMINSQZrrk 4754
+VPMINSQZrrkz 4755
+VPMINSWYrm 4756
+VPMINSWYrr 4757
+VPMINSWZ 4758
+VPMINSWZrm 4759
+VPMINSWZrmk 4760
+VPMINSWZrmkz 4761
+VPMINSWZrr 4762
+VPMINSWZrrk 4763
+VPMINSWZrrkz 4764
+VPMINSWrm 4765
+VPMINSWrr 4766
+VPMINUBYrm 4767
+VPMINUBYrr 4768
+VPMINUBZ 4769
+VPMINUBZrm 4770
+VPMINUBZrmk 4771
+VPMINUBZrmkz 4772
+VPMINUBZrr 4773
+VPMINUBZrrk 4774
+VPMINUBZrrkz 4775
+VPMINUBrm 4776
+VPMINUBrr 4777
+VPMINUDYrm 4778
+VPMINUDYrr 4779
+VPMINUDZ 4780
+VPMINUDZrm 4781
+VPMINUDZrmb 4782
+VPMINUDZrmbk 4783
+VPMINUDZrmbkz 4784
+VPMINUDZrmk 4785
+VPMINUDZrmkz 4786
+VPMINUDZrr 4787
+VPMINUDZrrk 4788
+VPMINUDZrrkz 4789
+VPMINUDrm 4790
+VPMINUDrr 4791
+VPMINUQZ 4792
+VPMINUQZrm 4793
+VPMINUQZrmb 4794
+VPMINUQZrmbk 4795
+VPMINUQZrmbkz 4796
+VPMINUQZrmk 4797
+VPMINUQZrmkz 4798
+VPMINUQZrr 4799
+VPMINUQZrrk 4800
+VPMINUQZrrkz 4801
+VPMINUWYrm 4802
+VPMINUWYrr 4803
+VPMINUWZ 4804
+VPMINUWZrm 4805
+VPMINUWZrmk 4806
+VPMINUWZrmkz 4807
+VPMINUWZrr 4808
+VPMINUWZrrk 4809
+VPMINUWZrrkz 4810
+VPMINUWrm 4811
+VPMINUWrr 4812
+VPMOVB 4813
+VPMOVD 4814
+VPMOVDBZ 4815
+VPMOVDBZmr 4816
+VPMOVDBZmrk 4817
+VPMOVDBZrr 4818
+VPMOVDBZrrk 4819
+VPMOVDBZrrkz 4820
+VPMOVDWZ 4821
+VPMOVDWZmr 4822
+VPMOVDWZmrk 4823
+VPMOVDWZrr 4824
+VPMOVDWZrrk 4825
+VPMOVDWZrrkz 4826
+VPMOVM 4827
+VPMOVMSKBYrr 4828
+VPMOVMSKBrr 4829
+VPMOVQ 4830
+VPMOVQBZ 4831
+VPMOVQBZmr 4832
+VPMOVQBZmrk 4833
+VPMOVQBZrr 4834
+VPMOVQBZrrk 4835
+VPMOVQBZrrkz 4836
+VPMOVQDZ 4837
+VPMOVQDZmr 4838
+VPMOVQDZmrk 4839
+VPMOVQDZrr 4840
+VPMOVQDZrrk 4841
+VPMOVQDZrrkz 4842
+VPMOVQWZ 4843
+VPMOVQWZmr 4844
+VPMOVQWZmrk 4845
+VPMOVQWZrr 4846
+VPMOVQWZrrk 4847
+VPMOVQWZrrkz 4848
+VPMOVSDBZ 4849
+VPMOVSDBZmr 4850
+VPMOVSDBZmrk 4851
+VPMOVSDBZrr 4852
+VPMOVSDBZrrk 4853
+VPMOVSDBZrrkz 4854
+VPMOVSDWZ 4855
+VPMOVSDWZmr 4856
+VPMOVSDWZmrk 4857
+VPMOVSDWZrr 4858
+VPMOVSDWZrrk 4859
+VPMOVSDWZrrkz 4860
+VPMOVSQBZ 4861
+VPMOVSQBZmr 4862
+VPMOVSQBZmrk 4863
+VPMOVSQBZrr 4864
+VPMOVSQBZrrk 4865
+VPMOVSQBZrrkz 4866
+VPMOVSQDZ 4867
+VPMOVSQDZmr 4868
+VPMOVSQDZmrk 4869
+VPMOVSQDZrr 4870
+VPMOVSQDZrrk 4871
+VPMOVSQDZrrkz 4872
+VPMOVSQWZ 4873
+VPMOVSQWZmr 4874
+VPMOVSQWZmrk 4875
+VPMOVSQWZrr 4876
+VPMOVSQWZrrk 4877
+VPMOVSQWZrrkz 4878
+VPMOVSWBZ 4879
+VPMOVSWBZmr 4880
+VPMOVSWBZmrk 4881
+VPMOVSWBZrr 4882
+VPMOVSWBZrrk 4883
+VPMOVSWBZrrkz 4884
+VPMOVSXBDYrm 4885
+VPMOVSXBDYrr 4886
+VPMOVSXBDZ 4887
+VPMOVSXBDZrm 4888
+VPMOVSXBDZrmk 4889
+VPMOVSXBDZrmkz 4890
+VPMOVSXBDZrr 4891
+VPMOVSXBDZrrk 4892
+VPMOVSXBDZrrkz 4893
+VPMOVSXBDrm 4894
+VPMOVSXBDrr 4895
+VPMOVSXBQYrm 4896
+VPMOVSXBQYrr 4897
+VPMOVSXBQZ 4898
+VPMOVSXBQZrm 4899
+VPMOVSXBQZrmk 4900
+VPMOVSXBQZrmkz 4901
+VPMOVSXBQZrr 4902
+VPMOVSXBQZrrk 4903
+VPMOVSXBQZrrkz 4904
+VPMOVSXBQrm 4905
+VPMOVSXBQrr 4906
+VPMOVSXBWYrm 4907
+VPMOVSXBWYrr 4908
+VPMOVSXBWZ 4909
+VPMOVSXBWZrm 4910
+VPMOVSXBWZrmk 4911
+VPMOVSXBWZrmkz 4912
+VPMOVSXBWZrr 4913
+VPMOVSXBWZrrk 4914
+VPMOVSXBWZrrkz 4915
+VPMOVSXBWrm 4916
+VPMOVSXBWrr 4917
+VPMOVSXDQYrm 4918
+VPMOVSXDQYrr 4919
+VPMOVSXDQZ 4920
+VPMOVSXDQZrm 4921
+VPMOVSXDQZrmk 4922
+VPMOVSXDQZrmkz 4923
+VPMOVSXDQZrr 4924
+VPMOVSXDQZrrk 4925
+VPMOVSXDQZrrkz 4926
+VPMOVSXDQrm 4927
+VPMOVSXDQrr 4928
+VPMOVSXWDYrm 4929
+VPMOVSXWDYrr 4930
+VPMOVSXWDZ 4931
+VPMOVSXWDZrm 4932
+VPMOVSXWDZrmk 4933
+VPMOVSXWDZrmkz 4934
+VPMOVSXWDZrr 4935
+VPMOVSXWDZrrk 4936
+VPMOVSXWDZrrkz 4937
+VPMOVSXWDrm 4938
+VPMOVSXWDrr 4939
+VPMOVSXWQYrm 4940
+VPMOVSXWQYrr 4941
+VPMOVSXWQZ 4942
+VPMOVSXWQZrm 4943
+VPMOVSXWQZrmk 4944
+VPMOVSXWQZrmkz 4945
+VPMOVSXWQZrr 4946
+VPMOVSXWQZrrk 4947
+VPMOVSXWQZrrkz 4948
+VPMOVSXWQrm 4949
+VPMOVSXWQrr 4950
+VPMOVUSDBZ 4951
+VPMOVUSDBZmr 4952
+VPMOVUSDBZmrk 4953
+VPMOVUSDBZrr 4954
+VPMOVUSDBZrrk 4955
+VPMOVUSDBZrrkz 4956
+VPMOVUSDWZ 4957
+VPMOVUSDWZmr 4958
+VPMOVUSDWZmrk 4959
+VPMOVUSDWZrr 4960
+VPMOVUSDWZrrk 4961
+VPMOVUSDWZrrkz 4962
+VPMOVUSQBZ 4963
+VPMOVUSQBZmr 4964
+VPMOVUSQBZmrk 4965
+VPMOVUSQBZrr 4966
+VPMOVUSQBZrrk 4967
+VPMOVUSQBZrrkz 4968
+VPMOVUSQDZ 4969
+VPMOVUSQDZmr 4970
+VPMOVUSQDZmrk 4971
+VPMOVUSQDZrr 4972
+VPMOVUSQDZrrk 4973
+VPMOVUSQDZrrkz 4974
+VPMOVUSQWZ 4975
+VPMOVUSQWZmr 4976
+VPMOVUSQWZmrk 4977
+VPMOVUSQWZrr 4978
+VPMOVUSQWZrrk 4979
+VPMOVUSQWZrrkz 4980
+VPMOVUSWBZ 4981
+VPMOVUSWBZmr 4982
+VPMOVUSWBZmrk 4983
+VPMOVUSWBZrr 4984
+VPMOVUSWBZrrk 4985
+VPMOVUSWBZrrkz 4986
+VPMOVW 4987
+VPMOVWBZ 4988
+VPMOVWBZmr 4989
+VPMOVWBZmrk 4990
+VPMOVWBZrr 4991
+VPMOVWBZrrk 4992
+VPMOVWBZrrkz 4993
+VPMOVZXBDYrm 4994
+VPMOVZXBDYrr 4995
+VPMOVZXBDZ 4996
+VPMOVZXBDZrm 4997
+VPMOVZXBDZrmk 4998
+VPMOVZXBDZrmkz 4999
+VPMOVZXBDZrr 5000
+VPMOVZXBDZrrk 5001
+VPMOVZXBDZrrkz 5002
+VPMOVZXBDrm 5003
+VPMOVZXBDrr 5004
+VPMOVZXBQYrm 5005
+VPMOVZXBQYrr 5006
+VPMOVZXBQZ 5007
+VPMOVZXBQZrm 5008
+VPMOVZXBQZrmk 5009
+VPMOVZXBQZrmkz 5010
+VPMOVZXBQZrr 5011
+VPMOVZXBQZrrk 5012
+VPMOVZXBQZrrkz 5013
+VPMOVZXBQrm 5014
+VPMOVZXBQrr 5015
+VPMOVZXBWYrm 5016
+VPMOVZXBWYrr 5017
+VPMOVZXBWZ 5018
+VPMOVZXBWZrm 5019
+VPMOVZXBWZrmk 5020
+VPMOVZXBWZrmkz 5021
+VPMOVZXBWZrr 5022
+VPMOVZXBWZrrk 5023
+VPMOVZXBWZrrkz 5024
+VPMOVZXBWrm 5025
+VPMOVZXBWrr 5026
+VPMOVZXDQYrm 5027
+VPMOVZXDQYrr 5028
+VPMOVZXDQZ 5029
+VPMOVZXDQZrm 5030
+VPMOVZXDQZrmk 5031
+VPMOVZXDQZrmkz 5032
+VPMOVZXDQZrr 5033
+VPMOVZXDQZrrk 5034
+VPMOVZXDQZrrkz 5035
+VPMOVZXDQrm 5036
+VPMOVZXDQrr 5037
+VPMOVZXWDYrm 5038
+VPMOVZXWDYrr 5039
+VPMOVZXWDZ 5040
+VPMOVZXWDZrm 5041
+VPMOVZXWDZrmk 5042
+VPMOVZXWDZrmkz 5043
+VPMOVZXWDZrr 5044
+VPMOVZXWDZrrk 5045
+VPMOVZXWDZrrkz 5046
+VPMOVZXWDrm 5047
+VPMOVZXWDrr 5048
+VPMOVZXWQYrm 5049
+VPMOVZXWQYrr 5050
+VPMOVZXWQZ 5051
+VPMOVZXWQZrm 5052
+VPMOVZXWQZrmk 5053
+VPMOVZXWQZrmkz 5054
+VPMOVZXWQZrr 5055
+VPMOVZXWQZrrk 5056
+VPMOVZXWQZrrkz 5057
+VPMOVZXWQrm 5058
+VPMOVZXWQrr 5059
+VPMULDQYrm 5060
+VPMULDQYrr 5061
+VPMULDQZ 5062
+VPMULDQZrm 5063
+VPMULDQZrmb 5064
+VPMULDQZrmbk 5065
+VPMULDQZrmbkz 5066
+VPMULDQZrmk 5067
+VPMULDQZrmkz 5068
+VPMULDQZrr 5069
+VPMULDQZrrk 5070
+VPMULDQZrrkz 5071
+VPMULDQrm 5072
+VPMULDQrr 5073
+VPMULHRSWYrm 5074
+VPMULHRSWYrr 5075
+VPMULHRSWZ 5076
+VPMULHRSWZrm 5077
+VPMULHRSWZrmk 5078
+VPMULHRSWZrmkz 5079
+VPMULHRSWZrr 5080
+VPMULHRSWZrrk 5081
+VPMULHRSWZrrkz 5082
+VPMULHRSWrm 5083
+VPMULHRSWrr 5084
+VPMULHUWYrm 5085
+VPMULHUWYrr 5086
+VPMULHUWZ 5087
+VPMULHUWZrm 5088
+VPMULHUWZrmk 5089
+VPMULHUWZrmkz 5090
+VPMULHUWZrr 5091
+VPMULHUWZrrk 5092
+VPMULHUWZrrkz 5093
+VPMULHUWrm 5094
+VPMULHUWrr 5095
+VPMULHWYrm 5096
+VPMULHWYrr 5097
+VPMULHWZ 5098
+VPMULHWZrm 5099
+VPMULHWZrmk 5100
+VPMULHWZrmkz 5101
+VPMULHWZrr 5102
+VPMULHWZrrk 5103
+VPMULHWZrrkz 5104
+VPMULHWrm 5105
+VPMULHWrr 5106
+VPMULLDYrm 5107
+VPMULLDYrr 5108
+VPMULLDZ 5109
+VPMULLDZrm 5110
+VPMULLDZrmb 5111
+VPMULLDZrmbk 5112
+VPMULLDZrmbkz 5113
+VPMULLDZrmk 5114
+VPMULLDZrmkz 5115
+VPMULLDZrr 5116
+VPMULLDZrrk 5117
+VPMULLDZrrkz 5118
+VPMULLDrm 5119
+VPMULLDrr 5120
+VPMULLQZ 5121
+VPMULLQZrm 5122
+VPMULLQZrmb 5123
+VPMULLQZrmbk 5124
+VPMULLQZrmbkz 5125
+VPMULLQZrmk 5126
+VPMULLQZrmkz 5127
+VPMULLQZrr 5128
+VPMULLQZrrk 5129
+VPMULLQZrrkz 5130
+VPMULLWYrm 5131
+VPMULLWYrr 5132
+VPMULLWZ 5133
+VPMULLWZrm 5134
+VPMULLWZrmk 5135
+VPMULLWZrmkz 5136
+VPMULLWZrr 5137
+VPMULLWZrrk 5138
+VPMULLWZrrkz 5139
+VPMULLWrm 5140
+VPMULLWrr 5141
+VPMULTISHIFTQBZ 5142
+VPMULTISHIFTQBZrm 5143
+VPMULTISHIFTQBZrmb 5144
+VPMULTISHIFTQBZrmbk 5145
+VPMULTISHIFTQBZrmbkz 5146
+VPMULTISHIFTQBZrmk 5147
+VPMULTISHIFTQBZrmkz 5148
+VPMULTISHIFTQBZrr 5149
+VPMULTISHIFTQBZrrk 5150
+VPMULTISHIFTQBZrrkz 5151
+VPMULUDQYrm 5152
+VPMULUDQYrr 5153
+VPMULUDQZ 5154
+VPMULUDQZrm 5155
+VPMULUDQZrmb 5156
+VPMULUDQZrmbk 5157
+VPMULUDQZrmbkz 5158
+VPMULUDQZrmk 5159
+VPMULUDQZrmkz 5160
+VPMULUDQZrr 5161
+VPMULUDQZrrk 5162
+VPMULUDQZrrkz 5163
+VPMULUDQrm 5164
+VPMULUDQrr 5165
+VPOPCNTBZ 5166
+VPOPCNTBZrm 5167
+VPOPCNTBZrmk 5168
+VPOPCNTBZrmkz 5169
+VPOPCNTBZrr 5170
+VPOPCNTBZrrk 5171
+VPOPCNTBZrrkz 5172
+VPOPCNTDZ 5173
+VPOPCNTDZrm 5174
+VPOPCNTDZrmb 5175
+VPOPCNTDZrmbk 5176
+VPOPCNTDZrmbkz 5177
+VPOPCNTDZrmk 5178
+VPOPCNTDZrmkz 5179
+VPOPCNTDZrr 5180
+VPOPCNTDZrrk 5181
+VPOPCNTDZrrkz 5182
+VPOPCNTQZ 5183
+VPOPCNTQZrm 5184
+VPOPCNTQZrmb 5185
+VPOPCNTQZrmbk 5186
+VPOPCNTQZrmbkz 5187
+VPOPCNTQZrmk 5188
+VPOPCNTQZrmkz 5189
+VPOPCNTQZrr 5190
+VPOPCNTQZrrk 5191
+VPOPCNTQZrrkz 5192
+VPOPCNTWZ 5193
+VPOPCNTWZrm 5194
+VPOPCNTWZrmk 5195
+VPOPCNTWZrmkz 5196
+VPOPCNTWZrr 5197
+VPOPCNTWZrrk 5198
+VPOPCNTWZrrkz 5199
+VPORDZ 5200
+VPORDZrm 5201
+VPORDZrmb 5202
+VPORDZrmbk 5203
+VPORDZrmbkz 5204
+VPORDZrmk 5205
+VPORDZrmkz 5206
+VPORDZrr 5207
+VPORDZrrk 5208
+VPORDZrrkz 5209
+VPORQZ 5210
+VPORQZrm 5211
+VPORQZrmb 5212
+VPORQZrmbk 5213
+VPORQZrmbkz 5214
+VPORQZrmk 5215
+VPORQZrmkz 5216
+VPORQZrr 5217
+VPORQZrrk 5218
+VPORQZrrkz 5219
+VPORYrm 5220
+VPORYrr 5221
+VPORrm 5222
+VPORrr 5223
+VPPERMrmr 5224
+VPPERMrrm 5225
+VPPERMrrr 5226
+VPPERMrrr_REV 5227
+VPROLDZ 5228
+VPROLDZmbi 5229
+VPROLDZmbik 5230
+VPROLDZmbikz 5231
+VPROLDZmi 5232
+VPROLDZmik 5233
+VPROLDZmikz 5234
+VPROLDZri 5235
+VPROLDZrik 5236
+VPROLDZrikz 5237
+VPROLQZ 5238
+VPROLQZmbi 5239
+VPROLQZmbik 5240
+VPROLQZmbikz 5241
+VPROLQZmi 5242
+VPROLQZmik 5243
+VPROLQZmikz 5244
+VPROLQZri 5245
+VPROLQZrik 5246
+VPROLQZrikz 5247
+VPROLVDZ 5248
+VPROLVDZrm 5249
+VPROLVDZrmb 5250
+VPROLVDZrmbk 5251
+VPROLVDZrmbkz 5252
+VPROLVDZrmk 5253
+VPROLVDZrmkz 5254
+VPROLVDZrr 5255
+VPROLVDZrrk 5256
+VPROLVDZrrkz 5257
+VPROLVQZ 5258
+VPROLVQZrm 5259
+VPROLVQZrmb 5260
+VPROLVQZrmbk 5261
+VPROLVQZrmbkz 5262
+VPROLVQZrmk 5263
+VPROLVQZrmkz 5264
+VPROLVQZrr 5265
+VPROLVQZrrk 5266
+VPROLVQZrrkz 5267
+VPRORDZ 5268
+VPRORDZmbi 5269
+VPRORDZmbik 5270
+VPRORDZmbikz 5271
+VPRORDZmi 5272
+VPRORDZmik 5273
+VPRORDZmikz 5274
+VPRORDZri 5275
+VPRORDZrik 5276
+VPRORDZrikz 5277
+VPRORQZ 5278
+VPRORQZmbi 5279
+VPRORQZmbik 5280
+VPRORQZmbikz 5281
+VPRORQZmi 5282
+VPRORQZmik 5283
+VPRORQZmikz 5284
+VPRORQZri 5285
+VPRORQZrik 5286
+VPRORQZrikz 5287
+VPRORVDZ 5288
+VPRORVDZrm 5289
+VPRORVDZrmb 5290
+VPRORVDZrmbk 5291
+VPRORVDZrmbkz 5292
+VPRORVDZrmk 5293
+VPRORVDZrmkz 5294
+VPRORVDZrr 5295
+VPRORVDZrrk 5296
+VPRORVDZrrkz 5297
+VPRORVQZ 5298
+VPRORVQZrm 5299
+VPRORVQZrmb 5300
+VPRORVQZrmbk 5301
+VPRORVQZrmbkz 5302
+VPRORVQZrmk 5303
+VPRORVQZrmkz 5304
+VPRORVQZrr 5305
+VPRORVQZrrk 5306
+VPRORVQZrrkz 5307
+VPROTBmi 5308
+VPROTBmr 5309
+VPROTBri 5310
+VPROTBrm 5311
+VPROTBrr 5312
+VPROTBrr_REV 5313
+VPROTDmi 5314
+VPROTDmr 5315
+VPROTDri 5316
+VPROTDrm 5317
+VPROTDrr 5318
+VPROTDrr_REV 5319
+VPROTQmi 5320
+VPROTQmr 5321
+VPROTQri 5322
+VPROTQrm 5323
+VPROTQrr 5324
+VPROTQrr_REV 5325
+VPROTWmi 5326
+VPROTWmr 5327
+VPROTWri 5328
+VPROTWrm 5329
+VPROTWrr 5330
+VPROTWrr_REV 5331
+VPSADBWYrm 5332
+VPSADBWYrr 5333
+VPSADBWZ 5334
+VPSADBWZrm 5335
+VPSADBWZrr 5336
+VPSADBWrm 5337
+VPSADBWrr 5338
+VPSCATTERDDZ 5339
+VPSCATTERDDZmr 5340
+VPSCATTERDQZ 5341
+VPSCATTERDQZmr 5342
+VPSCATTERQDZ 5343
+VPSCATTERQDZmr 5344
+VPSCATTERQQZ 5345
+VPSCATTERQQZmr 5346
+VPSHABmr 5347
+VPSHABrm 5348
+VPSHABrr 5349
+VPSHABrr_REV 5350
+VPSHADmr 5351
+VPSHADrm 5352
+VPSHADrr 5353
+VPSHADrr_REV 5354
+VPSHAQmr 5355
+VPSHAQrm 5356
+VPSHAQrr 5357
+VPSHAQrr_REV 5358
+VPSHAWmr 5359
+VPSHAWrm 5360
+VPSHAWrr 5361
+VPSHAWrr_REV 5362
+VPSHLBmr 5363
+VPSHLBrm 5364
+VPSHLBrr 5365
+VPSHLBrr_REV 5366
+VPSHLDDZ 5367
+VPSHLDDZrmbi 5368
+VPSHLDDZrmbik 5369
+VPSHLDDZrmbikz 5370
+VPSHLDDZrmi 5371
+VPSHLDDZrmik 5372
+VPSHLDDZrmikz 5373
+VPSHLDDZrri 5374
+VPSHLDDZrrik 5375
+VPSHLDDZrrikz 5376
+VPSHLDQZ 5377
+VPSHLDQZrmbi 5378
+VPSHLDQZrmbik 5379
+VPSHLDQZrmbikz 5380
+VPSHLDQZrmi 5381
+VPSHLDQZrmik 5382
+VPSHLDQZrmikz 5383
+VPSHLDQZrri 5384
+VPSHLDQZrrik 5385
+VPSHLDQZrrikz 5386
+VPSHLDVDZ 5387
+VPSHLDVDZm 5388
+VPSHLDVDZmb 5389
+VPSHLDVDZmbk 5390
+VPSHLDVDZmbkz 5391
+VPSHLDVDZmk 5392
+VPSHLDVDZmkz 5393
+VPSHLDVDZr 5394
+VPSHLDVDZrk 5395
+VPSHLDVDZrkz 5396
+VPSHLDVQZ 5397
+VPSHLDVQZm 5398
+VPSHLDVQZmb 5399
+VPSHLDVQZmbk 5400
+VPSHLDVQZmbkz 5401
+VPSHLDVQZmk 5402
+VPSHLDVQZmkz 5403
+VPSHLDVQZr 5404
+VPSHLDVQZrk 5405
+VPSHLDVQZrkz 5406
+VPSHLDVWZ 5407
+VPSHLDVWZm 5408
+VPSHLDVWZmk 5409
+VPSHLDVWZmkz 5410
+VPSHLDVWZr 5411
+VPSHLDVWZrk 5412
+VPSHLDVWZrkz 5413
+VPSHLDWZ 5414
+VPSHLDWZrmi 5415
+VPSHLDWZrmik 5416
+VPSHLDWZrmikz 5417
+VPSHLDWZrri 5418
+VPSHLDWZrrik 5419
+VPSHLDWZrrikz 5420
+VPSHLDmr 5421
+VPSHLDrm 5422
+VPSHLDrr 5423
+VPSHLDrr_REV 5424
+VPSHLQmr 5425
+VPSHLQrm 5426
+VPSHLQrr 5427
+VPSHLQrr_REV 5428
+VPSHLWmr 5429
+VPSHLWrm 5430
+VPSHLWrr 5431
+VPSHLWrr_REV 5432
+VPSHRDDZ 5433
+VPSHRDDZrmbi 5434
+VPSHRDDZrmbik 5435
+VPSHRDDZrmbikz 5436
+VPSHRDDZrmi 5437
+VPSHRDDZrmik 5438
+VPSHRDDZrmikz 5439
+VPSHRDDZrri 5440
+VPSHRDDZrrik 5441
+VPSHRDDZrrikz 5442
+VPSHRDQZ 5443
+VPSHRDQZrmbi 5444
+VPSHRDQZrmbik 5445
+VPSHRDQZrmbikz 5446
+VPSHRDQZrmi 5447
+VPSHRDQZrmik 5448
+VPSHRDQZrmikz 5449
+VPSHRDQZrri 5450
+VPSHRDQZrrik 5451
+VPSHRDQZrrikz 5452
+VPSHRDVDZ 5453
+VPSHRDVDZm 5454
+VPSHRDVDZmb 5455
+VPSHRDVDZmbk 5456
+VPSHRDVDZmbkz 5457
+VPSHRDVDZmk 5458
+VPSHRDVDZmkz 5459
+VPSHRDVDZr 5460
+VPSHRDVDZrk 5461
+VPSHRDVDZrkz 5462
+VPSHRDVQZ 5463
+VPSHRDVQZm 5464
+VPSHRDVQZmb 5465
+VPSHRDVQZmbk 5466
+VPSHRDVQZmbkz 5467
+VPSHRDVQZmk 5468
+VPSHRDVQZmkz 5469
+VPSHRDVQZr 5470
+VPSHRDVQZrk 5471
+VPSHRDVQZrkz 5472
+VPSHRDVWZ 5473
+VPSHRDVWZm 5474
+VPSHRDVWZmk 5475
+VPSHRDVWZmkz 5476
+VPSHRDVWZr 5477
+VPSHRDVWZrk 5478
+VPSHRDVWZrkz 5479
+VPSHRDWZ 5480
+VPSHRDWZrmi 5481
+VPSHRDWZrmik 5482
+VPSHRDWZrmikz 5483
+VPSHRDWZrri 5484
+VPSHRDWZrrik 5485
+VPSHRDWZrrikz 5486
+VPSHUFBITQMBZ 5487
+VPSHUFBITQMBZrm 5488
+VPSHUFBITQMBZrmk 5489
+VPSHUFBITQMBZrr 5490
+VPSHUFBITQMBZrrk 5491
+VPSHUFBYrm 5492
+VPSHUFBYrr 5493
+VPSHUFBZ 5494
+VPSHUFBZrm 5495
+VPSHUFBZrmk 5496
+VPSHUFBZrmkz 5497
+VPSHUFBZrr 5498
+VPSHUFBZrrk 5499
+VPSHUFBZrrkz 5500
+VPSHUFBrm 5501
+VPSHUFBrr 5502
+VPSHUFDYmi 5503
+VPSHUFDYri 5504
+VPSHUFDZ 5505
+VPSHUFDZmbi 5506
+VPSHUFDZmbik 5507
+VPSHUFDZmbikz 5508
+VPSHUFDZmi 5509
+VPSHUFDZmik 5510
+VPSHUFDZmikz 5511
+VPSHUFDZri 5512
+VPSHUFDZrik 5513
+VPSHUFDZrikz 5514
+VPSHUFDmi 5515
+VPSHUFDri 5516
+VPSHUFHWYmi 5517
+VPSHUFHWYri 5518
+VPSHUFHWZ 5519
+VPSHUFHWZmi 5520
+VPSHUFHWZmik 5521
+VPSHUFHWZmikz 5522
+VPSHUFHWZri 5523
+VPSHUFHWZrik 5524
+VPSHUFHWZrikz 5525
+VPSHUFHWmi 5526
+VPSHUFHWri 5527
+VPSHUFLWYmi 5528
+VPSHUFLWYri 5529
+VPSHUFLWZ 5530
+VPSHUFLWZmi 5531
+VPSHUFLWZmik 5532
+VPSHUFLWZmikz 5533
+VPSHUFLWZri 5534
+VPSHUFLWZrik 5535
+VPSHUFLWZrikz 5536
+VPSHUFLWmi 5537
+VPSHUFLWri 5538
+VPSIGNBYrm 5539
+VPSIGNBYrr 5540
+VPSIGNBrm 5541
+VPSIGNBrr 5542
+VPSIGNDYrm 5543
+VPSIGNDYrr 5544
+VPSIGNDrm 5545
+VPSIGNDrr 5546
+VPSIGNWYrm 5547
+VPSIGNWYrr 5548
+VPSIGNWrm 5549
+VPSIGNWrr 5550
+VPSLLDQYri 5551
+VPSLLDQZ 5552
+VPSLLDQZmi 5553
+VPSLLDQZri 5554
+VPSLLDQri 5555
+VPSLLDYri 5556
+VPSLLDYrm 5557
+VPSLLDYrr 5558
+VPSLLDZ 5559
+VPSLLDZmbi 5560
+VPSLLDZmbik 5561
+VPSLLDZmbikz 5562
+VPSLLDZmi 5563
+VPSLLDZmik 5564
+VPSLLDZmikz 5565
+VPSLLDZri 5566
+VPSLLDZrik 5567
+VPSLLDZrikz 5568
+VPSLLDZrm 5569
+VPSLLDZrmk 5570
+VPSLLDZrmkz 5571
+VPSLLDZrr 5572
+VPSLLDZrrk 5573
+VPSLLDZrrkz 5574
+VPSLLDri 5575
+VPSLLDrm 5576
+VPSLLDrr 5577
+VPSLLQYri 5578
+VPSLLQYrm 5579
+VPSLLQYrr 5580
+VPSLLQZ 5581
+VPSLLQZmbi 5582
+VPSLLQZmbik 5583
+VPSLLQZmbikz 5584
+VPSLLQZmi 5585
+VPSLLQZmik 5586
+VPSLLQZmikz 5587
+VPSLLQZri 5588
+VPSLLQZrik 5589
+VPSLLQZrikz 5590
+VPSLLQZrm 5591
+VPSLLQZrmk 5592
+VPSLLQZrmkz 5593
+VPSLLQZrr 5594
+VPSLLQZrrk 5595
+VPSLLQZrrkz 5596
+VPSLLQri 5597
+VPSLLQrm 5598
+VPSLLQrr 5599
+VPSLLVDYrm 5600
+VPSLLVDYrr 5601
+VPSLLVDZ 5602
+VPSLLVDZrm 5603
+VPSLLVDZrmb 5604
+VPSLLVDZrmbk 5605
+VPSLLVDZrmbkz 5606
+VPSLLVDZrmk 5607
+VPSLLVDZrmkz 5608
+VPSLLVDZrr 5609
+VPSLLVDZrrk 5610
+VPSLLVDZrrkz 5611
+VPSLLVDrm 5612
+VPSLLVDrr 5613
+VPSLLVQYrm 5614
+VPSLLVQYrr 5615
+VPSLLVQZ 5616
+VPSLLVQZrm 5617
+VPSLLVQZrmb 5618
+VPSLLVQZrmbk 5619
+VPSLLVQZrmbkz 5620
+VPSLLVQZrmk 5621
+VPSLLVQZrmkz 5622
+VPSLLVQZrr 5623
+VPSLLVQZrrk 5624
+VPSLLVQZrrkz 5625
+VPSLLVQrm 5626
+VPSLLVQrr 5627
+VPSLLVWZ 5628
+VPSLLVWZrm 5629
+VPSLLVWZrmk 5630
+VPSLLVWZrmkz 5631
+VPSLLVWZrr 5632
+VPSLLVWZrrk 5633
+VPSLLVWZrrkz 5634
+VPSLLWYri 5635
+VPSLLWYrm 5636
+VPSLLWYrr 5637
+VPSLLWZ 5638
+VPSLLWZmi 5639
+VPSLLWZmik 5640
+VPSLLWZmikz 5641
+VPSLLWZri 5642
+VPSLLWZrik 5643
+VPSLLWZrikz 5644
+VPSLLWZrm 5645
+VPSLLWZrmk 5646
+VPSLLWZrmkz 5647
+VPSLLWZrr 5648
+VPSLLWZrrk 5649
+VPSLLWZrrkz 5650
+VPSLLWri 5651
+VPSLLWrm 5652
+VPSLLWrr 5653
+VPSRADYri 5654
+VPSRADYrm 5655
+VPSRADYrr 5656
+VPSRADZ 5657
+VPSRADZmbi 5658
+VPSRADZmbik 5659
+VPSRADZmbikz 5660
+VPSRADZmi 5661
+VPSRADZmik 5662
+VPSRADZmikz 5663
+VPSRADZri 5664
+VPSRADZrik 5665
+VPSRADZrikz 5666
+VPSRADZrm 5667
+VPSRADZrmk 5668
+VPSRADZrmkz 5669
+VPSRADZrr 5670
+VPSRADZrrk 5671
+VPSRADZrrkz 5672
+VPSRADri 5673
+VPSRADrm 5674
+VPSRADrr 5675
+VPSRAQZ 5676
+VPSRAQZmbi 5677
+VPSRAQZmbik 5678
+VPSRAQZmbikz 5679
+VPSRAQZmi 5680
+VPSRAQZmik 5681
+VPSRAQZmikz 5682
+VPSRAQZri 5683
+VPSRAQZrik 5684
+VPSRAQZrikz 5685
+VPSRAQZrm 5686
+VPSRAQZrmk 5687
+VPSRAQZrmkz 5688
+VPSRAQZrr 5689
+VPSRAQZrrk 5690
+VPSRAQZrrkz 5691
+VPSRAVDYrm 5692
+VPSRAVDYrr 5693
+VPSRAVDZ 5694
+VPSRAVDZrm 5695
+VPSRAVDZrmb 5696
+VPSRAVDZrmbk 5697
+VPSRAVDZrmbkz 5698
+VPSRAVDZrmk 5699
+VPSRAVDZrmkz 5700
+VPSRAVDZrr 5701
+VPSRAVDZrrk 5702
+VPSRAVDZrrkz 5703
+VPSRAVDrm 5704
+VPSRAVDrr 5705
+VPSRAVQZ 5706
+VPSRAVQZrm 5707
+VPSRAVQZrmb 5708
+VPSRAVQZrmbk 5709
+VPSRAVQZrmbkz 5710
+VPSRAVQZrmk 5711
+VPSRAVQZrmkz 5712
+VPSRAVQZrr 5713
+VPSRAVQZrrk 5714
+VPSRAVQZrrkz 5715
+VPSRAVWZ 5716
+VPSRAVWZrm 5717
+VPSRAVWZrmk 5718
+VPSRAVWZrmkz 5719
+VPSRAVWZrr 5720
+VPSRAVWZrrk 5721
+VPSRAVWZrrkz 5722
+VPSRAWYri 5723
+VPSRAWYrm 5724
+VPSRAWYrr 5725
+VPSRAWZ 5726
+VPSRAWZmi 5727
+VPSRAWZmik 5728
+VPSRAWZmikz 5729
+VPSRAWZri 5730
+VPSRAWZrik 5731
+VPSRAWZrikz 5732
+VPSRAWZrm 5733
+VPSRAWZrmk 5734
+VPSRAWZrmkz 5735
+VPSRAWZrr 5736
+VPSRAWZrrk 5737
+VPSRAWZrrkz 5738
+VPSRAWri 5739
+VPSRAWrm 5740
+VPSRAWrr 5741
+VPSRLDQYri 5742
+VPSRLDQZ 5743
+VPSRLDQZmi 5744
+VPSRLDQZri 5745
+VPSRLDQri 5746
+VPSRLDYri 5747
+VPSRLDYrm 5748
+VPSRLDYrr 5749
+VPSRLDZ 5750
+VPSRLDZmbi 5751
+VPSRLDZmbik 5752
+VPSRLDZmbikz 5753
+VPSRLDZmi 5754
+VPSRLDZmik 5755
+VPSRLDZmikz 5756
+VPSRLDZri 5757
+VPSRLDZrik 5758
+VPSRLDZrikz 5759
+VPSRLDZrm 5760
+VPSRLDZrmk 5761
+VPSRLDZrmkz 5762
+VPSRLDZrr 5763
+VPSRLDZrrk 5764
+VPSRLDZrrkz 5765
+VPSRLDri 5766
+VPSRLDrm 5767
+VPSRLDrr 5768
+VPSRLQYri 5769
+VPSRLQYrm 5770
+VPSRLQYrr 5771
+VPSRLQZ 5772
+VPSRLQZmbi 5773
+VPSRLQZmbik 5774
+VPSRLQZmbikz 5775
+VPSRLQZmi 5776
+VPSRLQZmik 5777
+VPSRLQZmikz 5778
+VPSRLQZri 5779
+VPSRLQZrik 5780
+VPSRLQZrikz 5781
+VPSRLQZrm 5782
+VPSRLQZrmk 5783
+VPSRLQZrmkz 5784
+VPSRLQZrr 5785
+VPSRLQZrrk 5786
+VPSRLQZrrkz 5787
+VPSRLQri 5788
+VPSRLQrm 5789
+VPSRLQrr 5790
+VPSRLVDYrm 5791
+VPSRLVDYrr 5792
+VPSRLVDZ 5793
+VPSRLVDZrm 5794
+VPSRLVDZrmb 5795
+VPSRLVDZrmbk 5796
+VPSRLVDZrmbkz 5797
+VPSRLVDZrmk 5798
+VPSRLVDZrmkz 5799
+VPSRLVDZrr 5800
+VPSRLVDZrrk 5801
+VPSRLVDZrrkz 5802
+VPSRLVDrm 5803
+VPSRLVDrr 5804
+VPSRLVQYrm 5805
+VPSRLVQYrr 5806
+VPSRLVQZ 5807
+VPSRLVQZrm 5808
+VPSRLVQZrmb 5809
+VPSRLVQZrmbk 5810
+VPSRLVQZrmbkz 5811
+VPSRLVQZrmk 5812
+VPSRLVQZrmkz 5813
+VPSRLVQZrr 5814
+VPSRLVQZrrk 5815
+VPSRLVQZrrkz 5816
+VPSRLVQrm 5817
+VPSRLVQrr 5818
+VPSRLVWZ 5819
+VPSRLVWZrm 5820
+VPSRLVWZrmk 5821
+VPSRLVWZrmkz 5822
+VPSRLVWZrr 5823
+VPSRLVWZrrk 5824
+VPSRLVWZrrkz 5825
+VPSRLWYri 5826
+VPSRLWYrm 5827
+VPSRLWYrr 5828
+VPSRLWZ 5829
+VPSRLWZmi 5830
+VPSRLWZmik 5831
+VPSRLWZmikz 5832
+VPSRLWZri 5833
+VPSRLWZrik 5834
+VPSRLWZrikz 5835
+VPSRLWZrm 5836
+VPSRLWZrmk 5837
+VPSRLWZrmkz 5838
+VPSRLWZrr 5839
+VPSRLWZrrk 5840
+VPSRLWZrrkz 5841
+VPSRLWri 5842
+VPSRLWrm 5843
+VPSRLWrr 5844
+VPSUBBYrm 5845
+VPSUBBYrr 5846
+VPSUBBZ 5847
+VPSUBBZrm 5848
+VPSUBBZrmk 5849
+VPSUBBZrmkz 5850
+VPSUBBZrr 5851
+VPSUBBZrrk 5852
+VPSUBBZrrkz 5853
+VPSUBBrm 5854
+VPSUBBrr 5855
+VPSUBDYrm 5856
+VPSUBDYrr 5857
+VPSUBDZ 5858
+VPSUBDZrm 5859
+VPSUBDZrmb 5860
+VPSUBDZrmbk 5861
+VPSUBDZrmbkz 5862
+VPSUBDZrmk 5863
+VPSUBDZrmkz 5864
+VPSUBDZrr 5865
+VPSUBDZrrk 5866
+VPSUBDZrrkz 5867
+VPSUBDrm 5868
+VPSUBDrr 5869
+VPSUBQYrm 5870
+VPSUBQYrr 5871
+VPSUBQZ 5872
+VPSUBQZrm 5873
+VPSUBQZrmb 5874
+VPSUBQZrmbk 5875
+VPSUBQZrmbkz 5876
+VPSUBQZrmk 5877
+VPSUBQZrmkz 5878
+VPSUBQZrr 5879
+VPSUBQZrrk 5880
+VPSUBQZrrkz 5881
+VPSUBQrm 5882
+VPSUBQrr 5883
+VPSUBSBYrm 5884
+VPSUBSBYrr 5885
+VPSUBSBZ 5886
+VPSUBSBZrm 5887
+VPSUBSBZrmk 5888
+VPSUBSBZrmkz 5889
+VPSUBSBZrr 5890
+VPSUBSBZrrk 5891
+VPSUBSBZrrkz 5892
+VPSUBSBrm 5893
+VPSUBSBrr 5894
+VPSUBSWYrm 5895
+VPSUBSWYrr 5896
+VPSUBSWZ 5897
+VPSUBSWZrm 5898
+VPSUBSWZrmk 5899
+VPSUBSWZrmkz 5900
+VPSUBSWZrr 5901
+VPSUBSWZrrk 5902
+VPSUBSWZrrkz 5903
+VPSUBSWrm 5904
+VPSUBSWrr 5905
+VPSUBUSBYrm 5906
+VPSUBUSBYrr 5907
+VPSUBUSBZ 5908
+VPSUBUSBZrm 5909
+VPSUBUSBZrmk 5910
+VPSUBUSBZrmkz 5911
+VPSUBUSBZrr 5912
+VPSUBUSBZrrk 5913
+VPSUBUSBZrrkz 5914
+VPSUBUSBrm 5915
+VPSUBUSBrr 5916
+VPSUBUSWYrm 5917
+VPSUBUSWYrr 5918
+VPSUBUSWZ 5919
+VPSUBUSWZrm 5920
+VPSUBUSWZrmk 5921
+VPSUBUSWZrmkz 5922
+VPSUBUSWZrr 5923
+VPSUBUSWZrrk 5924
+VPSUBUSWZrrkz 5925
+VPSUBUSWrm 5926
+VPSUBUSWrr 5927
+VPSUBWYrm 5928
+VPSUBWYrr 5929
+VPSUBWZ 5930
+VPSUBWZrm 5931
+VPSUBWZrmk 5932
+VPSUBWZrmkz 5933
+VPSUBWZrr 5934
+VPSUBWZrrk 5935
+VPSUBWZrrkz 5936
+VPSUBWrm 5937
+VPSUBWrr 5938
+VPTERNLOGDZ 5939
+VPTERNLOGDZrmbi 5940
+VPTERNLOGDZrmbik 5941
+VPTERNLOGDZrmbikz 5942
+VPTERNLOGDZrmi 5943
+VPTERNLOGDZrmik 5944
+VPTERNLOGDZrmikz 5945
+VPTERNLOGDZrri 5946
+VPTERNLOGDZrrik 5947
+VPTERNLOGDZrrikz 5948
+VPTERNLOGQZ 5949
+VPTERNLOGQZrmbi 5950
+VPTERNLOGQZrmbik 5951
+VPTERNLOGQZrmbikz 5952
+VPTERNLOGQZrmi 5953
+VPTERNLOGQZrmik 5954
+VPTERNLOGQZrmikz 5955
+VPTERNLOGQZrri 5956
+VPTERNLOGQZrrik 5957
+VPTERNLOGQZrrikz 5958
+VPTESTMBZ 5959
+VPTESTMBZrm 5960
+VPTESTMBZrmk 5961
+VPTESTMBZrr 5962
+VPTESTMBZrrk 5963
+VPTESTMDZ 5964
+VPTESTMDZrm 5965
+VPTESTMDZrmb 5966
+VPTESTMDZrmbk 5967
+VPTESTMDZrmk 5968
+VPTESTMDZrr 5969
+VPTESTMDZrrk 5970
+VPTESTMQZ 5971
+VPTESTMQZrm 5972
+VPTESTMQZrmb 5973
+VPTESTMQZrmbk 5974
+VPTESTMQZrmk 5975
+VPTESTMQZrr 5976
+VPTESTMQZrrk 5977
+VPTESTMWZ 5978
+VPTESTMWZrm 5979
+VPTESTMWZrmk 5980
+VPTESTMWZrr 5981
+VPTESTMWZrrk 5982
+VPTESTNMBZ 5983
+VPTESTNMBZrm 5984
+VPTESTNMBZrmk 5985
+VPTESTNMBZrr 5986
+VPTESTNMBZrrk 5987
+VPTESTNMDZ 5988
+VPTESTNMDZrm 5989
+VPTESTNMDZrmb 5990
+VPTESTNMDZrmbk 5991
+VPTESTNMDZrmk 5992
+VPTESTNMDZrr 5993
+VPTESTNMDZrrk 5994
+VPTESTNMQZ 5995
+VPTESTNMQZrm 5996
+VPTESTNMQZrmb 5997
+VPTESTNMQZrmbk 5998
+VPTESTNMQZrmk 5999
+VPTESTNMQZrr 6000
+VPTESTNMQZrrk 6001
+VPTESTNMWZ 6002
+VPTESTNMWZrm 6003
+VPTESTNMWZrmk 6004
+VPTESTNMWZrr 6005
+VPTESTNMWZrrk 6006
+VPTESTYrm 6007
+VPTESTYrr 6008
+VPTESTrm 6009
+VPTESTrr 6010
+VPUNPCKHBWYrm 6011
+VPUNPCKHBWYrr 6012
+VPUNPCKHBWZ 6013
+VPUNPCKHBWZrm 6014
+VPUNPCKHBWZrmk 6015
+VPUNPCKHBWZrmkz 6016
+VPUNPCKHBWZrr 6017
+VPUNPCKHBWZrrk 6018
+VPUNPCKHBWZrrkz 6019
+VPUNPCKHBWrm 6020
+VPUNPCKHBWrr 6021
+VPUNPCKHDQYrm 6022
+VPUNPCKHDQYrr 6023
+VPUNPCKHDQZ 6024
+VPUNPCKHDQZrm 6025
+VPUNPCKHDQZrmb 6026
+VPUNPCKHDQZrmbk 6027
+VPUNPCKHDQZrmbkz 6028
+VPUNPCKHDQZrmk 6029
+VPUNPCKHDQZrmkz 6030
+VPUNPCKHDQZrr 6031
+VPUNPCKHDQZrrk 6032
+VPUNPCKHDQZrrkz 6033
+VPUNPCKHDQrm 6034
+VPUNPCKHDQrr 6035
+VPUNPCKHQDQYrm 6036
+VPUNPCKHQDQYrr 6037
+VPUNPCKHQDQZ 6038
+VPUNPCKHQDQZrm 6039
+VPUNPCKHQDQZrmb 6040
+VPUNPCKHQDQZrmbk 6041
+VPUNPCKHQDQZrmbkz 6042
+VPUNPCKHQDQZrmk 6043
+VPUNPCKHQDQZrmkz 6044
+VPUNPCKHQDQZrr 6045
+VPUNPCKHQDQZrrk 6046
+VPUNPCKHQDQZrrkz 6047
+VPUNPCKHQDQrm 6048
+VPUNPCKHQDQrr 6049
+VPUNPCKHWDYrm 6050
+VPUNPCKHWDYrr 6051
+VPUNPCKHWDZ 6052
+VPUNPCKHWDZrm 6053
+VPUNPCKHWDZrmk 6054
+VPUNPCKHWDZrmkz 6055
+VPUNPCKHWDZrr 6056
+VPUNPCKHWDZrrk 6057
+VPUNPCKHWDZrrkz 6058
+VPUNPCKHWDrm 6059
+VPUNPCKHWDrr 6060
+VPUNPCKLBWYrm 6061
+VPUNPCKLBWYrr 6062
+VPUNPCKLBWZ 6063
+VPUNPCKLBWZrm 6064
+VPUNPCKLBWZrmk 6065
+VPUNPCKLBWZrmkz 6066
+VPUNPCKLBWZrr 6067
+VPUNPCKLBWZrrk 6068
+VPUNPCKLBWZrrkz 6069
+VPUNPCKLBWrm 6070
+VPUNPCKLBWrr 6071
+VPUNPCKLDQYrm 6072
+VPUNPCKLDQYrr 6073
+VPUNPCKLDQZ 6074
+VPUNPCKLDQZrm 6075
+VPUNPCKLDQZrmb 6076
+VPUNPCKLDQZrmbk 6077
+VPUNPCKLDQZrmbkz 6078
+VPUNPCKLDQZrmk 6079
+VPUNPCKLDQZrmkz 6080
+VPUNPCKLDQZrr 6081
+VPUNPCKLDQZrrk 6082
+VPUNPCKLDQZrrkz 6083
+VPUNPCKLDQrm 6084
+VPUNPCKLDQrr 6085
+VPUNPCKLQDQYrm 6086
+VPUNPCKLQDQYrr 6087
+VPUNPCKLQDQZ 6088
+VPUNPCKLQDQZrm 6089
+VPUNPCKLQDQZrmb 6090
+VPUNPCKLQDQZrmbk 6091
+VPUNPCKLQDQZrmbkz 6092
+VPUNPCKLQDQZrmk 6093
+VPUNPCKLQDQZrmkz 6094
+VPUNPCKLQDQZrr 6095
+VPUNPCKLQDQZrrk 6096
+VPUNPCKLQDQZrrkz 6097
+VPUNPCKLQDQrm 6098
+VPUNPCKLQDQrr 6099
+VPUNPCKLWDYrm 6100
+VPUNPCKLWDYrr 6101
+VPUNPCKLWDZ 6102
+VPUNPCKLWDZrm 6103
+VPUNPCKLWDZrmk 6104
+VPUNPCKLWDZrmkz 6105
+VPUNPCKLWDZrr 6106
+VPUNPCKLWDZrrk 6107
+VPUNPCKLWDZrrkz 6108
+VPUNPCKLWDrm 6109
+VPUNPCKLWDrr 6110
+VPXORDZ 6111
+VPXORDZrm 6112
+VPXORDZrmb 6113
+VPXORDZrmbk 6114
+VPXORDZrmbkz 6115
+VPXORDZrmk 6116
+VPXORDZrmkz 6117
+VPXORDZrr 6118
+VPXORDZrrk 6119
+VPXORDZrrkz 6120
+VPXORQZ 6121
+VPXORQZrm 6122
+VPXORQZrmb 6123
+VPXORQZrmbk 6124
+VPXORQZrmbkz 6125
+VPXORQZrmk 6126
+VPXORQZrmkz 6127
+VPXORQZrr 6128
+VPXORQZrrk 6129
+VPXORQZrrkz 6130
+VPXORYrm 6131
+VPXORYrr 6132
+VPXORrm 6133
+VPXORrr 6134
+VRANGEPDZ 6135
+VRANGEPDZrmbi 6136
+VRANGEPDZrmbik 6137
+VRANGEPDZrmbikz 6138
+VRANGEPDZrmi 6139
+VRANGEPDZrmik 6140
+VRANGEPDZrmikz 6141
+VRANGEPDZrri 6142
+VRANGEPDZrrib 6143
+VRANGEPDZrribk 6144
+VRANGEPDZrribkz 6145
+VRANGEPDZrrik 6146
+VRANGEPDZrrikz 6147
+VRANGEPSZ 6148
+VRANGEPSZrmbi 6149
+VRANGEPSZrmbik 6150
+VRANGEPSZrmbikz 6151
+VRANGEPSZrmi 6152
+VRANGEPSZrmik 6153
+VRANGEPSZrmikz 6154
+VRANGEPSZrri 6155
+VRANGEPSZrrib 6156
+VRANGEPSZrribk 6157
+VRANGEPSZrribkz 6158
+VRANGEPSZrrik 6159
+VRANGEPSZrrikz 6160
+VRANGESDZrmi 6161
+VRANGESDZrmik 6162
+VRANGESDZrmikz 6163
+VRANGESDZrri 6164
+VRANGESDZrrib 6165
+VRANGESDZrribk 6166
+VRANGESDZrribkz 6167
+VRANGESDZrrik 6168
+VRANGESDZrrikz 6169
+VRANGESSZrmi 6170
+VRANGESSZrmik 6171
+VRANGESSZrmikz 6172
+VRANGESSZrri 6173
+VRANGESSZrrib 6174
+VRANGESSZrribk 6175
+VRANGESSZrribkz 6176
+VRANGESSZrrik 6177
+VRANGESSZrrikz 6178
+VRCP 6179
+VRCPBF 6180
+VRCPPHZ 6181
+VRCPPHZm 6182
+VRCPPHZmb 6183
+VRCPPHZmbk 6184
+VRCPPHZmbkz 6185
+VRCPPHZmk 6186
+VRCPPHZmkz 6187
+VRCPPHZr 6188
+VRCPPHZrk 6189
+VRCPPHZrkz 6190
+VRCPPSYm 6191
+VRCPPSYr 6192
+VRCPPSm 6193
+VRCPPSr 6194
+VRCPSHZrm 6195
+VRCPSHZrmk 6196
+VRCPSHZrmkz 6197
+VRCPSHZrr 6198
+VRCPSHZrrk 6199
+VRCPSHZrrkz 6200
+VRCPSSm 6201
+VRCPSSm_Int 6202
+VRCPSSr 6203
+VRCPSSr_Int 6204
+VREDUCEBF 6205
+VREDUCEPDZ 6206
+VREDUCEPDZrmbi 6207
+VREDUCEPDZrmbik 6208
+VREDUCEPDZrmbikz 6209
+VREDUCEPDZrmi 6210
+VREDUCEPDZrmik 6211
+VREDUCEPDZrmikz 6212
+VREDUCEPDZrri 6213
+VREDUCEPDZrrib 6214
+VREDUCEPDZrribk 6215
+VREDUCEPDZrribkz 6216
+VREDUCEPDZrrik 6217
+VREDUCEPDZrrikz 6218
+VREDUCEPHZ 6219
+VREDUCEPHZrmbi 6220
+VREDUCEPHZrmbik 6221
+VREDUCEPHZrmbikz 6222
+VREDUCEPHZrmi 6223
+VREDUCEPHZrmik 6224
+VREDUCEPHZrmikz 6225
+VREDUCEPHZrri 6226
+VREDUCEPHZrrib 6227
+VREDUCEPHZrribk 6228
+VREDUCEPHZrribkz 6229
+VREDUCEPHZrrik 6230
+VREDUCEPHZrrikz 6231
+VREDUCEPSZ 6232
+VREDUCEPSZrmbi 6233
+VREDUCEPSZrmbik 6234
+VREDUCEPSZrmbikz 6235
+VREDUCEPSZrmi 6236
+VREDUCEPSZrmik 6237
+VREDUCEPSZrmikz 6238
+VREDUCEPSZrri 6239
+VREDUCEPSZrrib 6240
+VREDUCEPSZrribk 6241
+VREDUCEPSZrribkz 6242
+VREDUCEPSZrrik 6243
+VREDUCEPSZrrikz 6244
+VREDUCESDZrmi 6245
+VREDUCESDZrmik 6246
+VREDUCESDZrmikz 6247
+VREDUCESDZrri 6248
+VREDUCESDZrrib 6249
+VREDUCESDZrribk 6250
+VREDUCESDZrribkz 6251
+VREDUCESDZrrik 6252
+VREDUCESDZrrikz 6253
+VREDUCESHZrmi 6254
+VREDUCESHZrmik 6255
+VREDUCESHZrmikz 6256
+VREDUCESHZrri 6257
+VREDUCESHZrrib 6258
+VREDUCESHZrribk 6259
+VREDUCESHZrribkz 6260
+VREDUCESHZrrik 6261
+VREDUCESHZrrikz 6262
+VREDUCESSZrmi 6263
+VREDUCESSZrmik 6264
+VREDUCESSZrmikz 6265
+VREDUCESSZrri 6266
+VREDUCESSZrrib 6267
+VREDUCESSZrribk 6268
+VREDUCESSZrribkz 6269
+VREDUCESSZrrik 6270
+VREDUCESSZrrikz 6271
+VRNDSCALEBF 6272
+VRNDSCALEPDZ 6273
+VRNDSCALEPDZrmbi 6274
+VRNDSCALEPDZrmbik 6275
+VRNDSCALEPDZrmbikz 6276
+VRNDSCALEPDZrmi 6277
+VRNDSCALEPDZrmik 6278
+VRNDSCALEPDZrmikz 6279
+VRNDSCALEPDZrri 6280
+VRNDSCALEPDZrrib 6281
+VRNDSCALEPDZrribk 6282
+VRNDSCALEPDZrribkz 6283
+VRNDSCALEPDZrrik 6284
+VRNDSCALEPDZrrikz 6285
+VRNDSCALEPHZ 6286
+VRNDSCALEPHZrmbi 6287
+VRNDSCALEPHZrmbik 6288
+VRNDSCALEPHZrmbikz 6289
+VRNDSCALEPHZrmi 6290
+VRNDSCALEPHZrmik 6291
+VRNDSCALEPHZrmikz 6292
+VRNDSCALEPHZrri 6293
+VRNDSCALEPHZrrib 6294
+VRNDSCALEPHZrribk 6295
+VRNDSCALEPHZrribkz 6296
+VRNDSCALEPHZrrik 6297
+VRNDSCALEPHZrrikz 6298
+VRNDSCALEPSZ 6299
+VRNDSCALEPSZrmbi 6300
+VRNDSCALEPSZrmbik 6301
+VRNDSCALEPSZrmbikz 6302
+VRNDSCALEPSZrmi 6303
+VRNDSCALEPSZrmik 6304
+VRNDSCALEPSZrmikz 6305
+VRNDSCALEPSZrri 6306
+VRNDSCALEPSZrrib 6307
+VRNDSCALEPSZrribk 6308
+VRNDSCALEPSZrribkz 6309
+VRNDSCALEPSZrrik 6310
+VRNDSCALEPSZrrikz 6311
+VRNDSCALESDZrmi 6312
+VRNDSCALESDZrmi_Int 6313
+VRNDSCALESDZrmik_Int 6314
+VRNDSCALESDZrmikz_Int 6315
+VRNDSCALESDZrri 6316
+VRNDSCALESDZrri_Int 6317
+VRNDSCALESDZrrib_Int 6318
+VRNDSCALESDZrribk_Int 6319
+VRNDSCALESDZrribkz_Int 6320
+VRNDSCALESDZrrik_Int 6321
+VRNDSCALESDZrrikz_Int 6322
+VRNDSCALESHZrmi 6323
+VRNDSCALESHZrmi_Int 6324
+VRNDSCALESHZrmik_Int 6325
+VRNDSCALESHZrmikz_Int 6326
+VRNDSCALESHZrri 6327
+VRNDSCALESHZrri_Int 6328
+VRNDSCALESHZrrib_Int 6329
+VRNDSCALESHZrribk_Int 6330
+VRNDSCALESHZrribkz_Int 6331
+VRNDSCALESHZrrik_Int 6332
+VRNDSCALESHZrrikz_Int 6333
+VRNDSCALESSZrmi 6334
+VRNDSCALESSZrmi_Int 6335
+VRNDSCALESSZrmik_Int 6336
+VRNDSCALESSZrmikz_Int 6337
+VRNDSCALESSZrri 6338
+VRNDSCALESSZrri_Int 6339
+VRNDSCALESSZrrib_Int 6340
+VRNDSCALESSZrribk_Int 6341
+VRNDSCALESSZrribkz_Int 6342
+VRNDSCALESSZrrik_Int 6343
+VRNDSCALESSZrrikz_Int 6344
+VROUNDPDYmi 6345
+VROUNDPDYri 6346
+VROUNDPDmi 6347
+VROUNDPDri 6348
+VROUNDPSYmi 6349
+VROUNDPSYri 6350
+VROUNDPSmi 6351
+VROUNDPSri 6352
+VROUNDSDmi 6353
+VROUNDSDmi_Int 6354
+VROUNDSDri 6355
+VROUNDSDri_Int 6356
+VROUNDSSmi 6357
+VROUNDSSmi_Int 6358
+VROUNDSSri 6359
+VROUNDSSri_Int 6360
+VRSQRT 6361
+VRSQRTBF 6362
+VRSQRTPHZ 6363
+VRSQRTPHZm 6364
+VRSQRTPHZmb 6365
+VRSQRTPHZmbk 6366
+VRSQRTPHZmbkz 6367
+VRSQRTPHZmk 6368
+VRSQRTPHZmkz 6369
+VRSQRTPHZr 6370
+VRSQRTPHZrk 6371
+VRSQRTPHZrkz 6372
+VRSQRTPSYm 6373
+VRSQRTPSYr 6374
+VRSQRTPSm 6375
+VRSQRTPSr 6376
+VRSQRTSHZrm 6377
+VRSQRTSHZrmk 6378
+VRSQRTSHZrmkz 6379
+VRSQRTSHZrr 6380
+VRSQRTSHZrrk 6381
+VRSQRTSHZrrkz 6382
+VRSQRTSSm 6383
+VRSQRTSSm_Int 6384
+VRSQRTSSr 6385
+VRSQRTSSr_Int 6386
+VSCALEFBF 6387
+VSCALEFPDZ 6388
+VSCALEFPDZrm 6389
+VSCALEFPDZrmb 6390
+VSCALEFPDZrmbk 6391
+VSCALEFPDZrmbkz 6392
+VSCALEFPDZrmk 6393
+VSCALEFPDZrmkz 6394
+VSCALEFPDZrr 6395
+VSCALEFPDZrrb 6396
+VSCALEFPDZrrbk 6397
+VSCALEFPDZrrbkz 6398
+VSCALEFPDZrrk 6399
+VSCALEFPDZrrkz 6400
+VSCALEFPHZ 6401
+VSCALEFPHZrm 6402
+VSCALEFPHZrmb 6403
+VSCALEFPHZrmbk 6404
+VSCALEFPHZrmbkz 6405
+VSCALEFPHZrmk 6406
+VSCALEFPHZrmkz 6407
+VSCALEFPHZrr 6408
+VSCALEFPHZrrb 6409
+VSCALEFPHZrrbk 6410
+VSCALEFPHZrrbkz 6411
+VSCALEFPHZrrk 6412
+VSCALEFPHZrrkz 6413
+VSCALEFPSZ 6414
+VSCALEFPSZrm 6415
+VSCALEFPSZrmb 6416
+VSCALEFPSZrmbk 6417
+VSCALEFPSZrmbkz 6418
+VSCALEFPSZrmk 6419
+VSCALEFPSZrmkz 6420
+VSCALEFPSZrr 6421
+VSCALEFPSZrrb 6422
+VSCALEFPSZrrbk 6423
+VSCALEFPSZrrbkz 6424
+VSCALEFPSZrrk 6425
+VSCALEFPSZrrkz 6426
+VSCALEFSDZrm 6427
+VSCALEFSDZrmk 6428
+VSCALEFSDZrmkz 6429
+VSCALEFSDZrr 6430
+VSCALEFSDZrrb_Int 6431
+VSCALEFSDZrrbk_Int 6432
+VSCALEFSDZrrbkz_Int 6433
+VSCALEFSDZrrk 6434
+VSCALEFSDZrrkz 6435
+VSCALEFSHZrm 6436
+VSCALEFSHZrmk 6437
+VSCALEFSHZrmkz 6438
+VSCALEFSHZrr 6439
+VSCALEFSHZrrb_Int 6440
+VSCALEFSHZrrbk_Int 6441
+VSCALEFSHZrrbkz_Int 6442
+VSCALEFSHZrrk 6443
+VSCALEFSHZrrkz 6444
+VSCALEFSSZrm 6445
+VSCALEFSSZrmk 6446
+VSCALEFSSZrmkz 6447
+VSCALEFSSZrr 6448
+VSCALEFSSZrrb_Int 6449
+VSCALEFSSZrrbk_Int 6450
+VSCALEFSSZrrbkz_Int 6451
+VSCALEFSSZrrk 6452
+VSCALEFSSZrrkz 6453
+VSCATTERDPDZ 6454
+VSCATTERDPDZmr 6455
+VSCATTERDPSZ 6456
+VSCATTERDPSZmr 6457
+VSCATTERPF 6458
+VSCATTERQPDZ 6459
+VSCATTERQPDZmr 6460
+VSCATTERQPSZ 6461
+VSCATTERQPSZmr 6462
+VSHA 6463
+VSHUFF 6464
+VSHUFI 6465
+VSHUFPDYrmi 6466
+VSHUFPDYrri 6467
+VSHUFPDZ 6468
+VSHUFPDZrmbi 6469
+VSHUFPDZrmbik 6470
+VSHUFPDZrmbikz 6471
+VSHUFPDZrmi 6472
+VSHUFPDZrmik 6473
+VSHUFPDZrmikz 6474
+VSHUFPDZrri 6475
+VSHUFPDZrrik 6476
+VSHUFPDZrrikz 6477
+VSHUFPDrmi 6478
+VSHUFPDrri 6479
+VSHUFPSYrmi 6480
+VSHUFPSYrri 6481
+VSHUFPSZ 6482
+VSHUFPSZrmbi 6483
+VSHUFPSZrmbik 6484
+VSHUFPSZrmbikz 6485
+VSHUFPSZrmi 6486
+VSHUFPSZrmik 6487
+VSHUFPSZrmikz 6488
+VSHUFPSZrri 6489
+VSHUFPSZrrik 6490
+VSHUFPSZrrikz 6491
+VSHUFPSrmi 6492
+VSHUFPSrri 6493
+VSM 6494
+VSQRTBF 6495
+VSQRTPDYm 6496
+VSQRTPDYr 6497
+VSQRTPDZ 6498
+VSQRTPDZm 6499
+VSQRTPDZmb 6500
+VSQRTPDZmbk 6501
+VSQRTPDZmbkz 6502
+VSQRTPDZmk 6503
+VSQRTPDZmkz 6504
+VSQRTPDZr 6505
+VSQRTPDZrb 6506
+VSQRTPDZrbk 6507
+VSQRTPDZrbkz 6508
+VSQRTPDZrk 6509
+VSQRTPDZrkz 6510
+VSQRTPDm 6511
+VSQRTPDr 6512
+VSQRTPHZ 6513
+VSQRTPHZm 6514
+VSQRTPHZmb 6515
+VSQRTPHZmbk 6516
+VSQRTPHZmbkz 6517
+VSQRTPHZmk 6518
+VSQRTPHZmkz 6519
+VSQRTPHZr 6520
+VSQRTPHZrb 6521
+VSQRTPHZrbk 6522
+VSQRTPHZrbkz 6523
+VSQRTPHZrk 6524
+VSQRTPHZrkz 6525
+VSQRTPSYm 6526
+VSQRTPSYr 6527
+VSQRTPSZ 6528
+VSQRTPSZm 6529
+VSQRTPSZmb 6530
+VSQRTPSZmbk 6531
+VSQRTPSZmbkz 6532
+VSQRTPSZmk 6533
+VSQRTPSZmkz 6534
+VSQRTPSZr 6535
+VSQRTPSZrb 6536
+VSQRTPSZrbk 6537
+VSQRTPSZrbkz 6538
+VSQRTPSZrk 6539
+VSQRTPSZrkz 6540
+VSQRTPSm 6541
+VSQRTPSr 6542
+VSQRTSDZm 6543
+VSQRTSDZm_Int 6544
+VSQRTSDZmk_Int 6545
+VSQRTSDZmkz_Int 6546
+VSQRTSDZr 6547
+VSQRTSDZr_Int 6548
+VSQRTSDZrb_Int 6549
+VSQRTSDZrbk_Int 6550
+VSQRTSDZrbkz_Int 6551
+VSQRTSDZrk_Int 6552
+VSQRTSDZrkz_Int 6553
+VSQRTSDm 6554
+VSQRTSDm_Int 6555
+VSQRTSDr 6556
+VSQRTSDr_Int 6557
+VSQRTSHZm 6558
+VSQRTSHZm_Int 6559
+VSQRTSHZmk_Int 6560
+VSQRTSHZmkz_Int 6561
+VSQRTSHZr 6562
+VSQRTSHZr_Int 6563
+VSQRTSHZrb_Int 6564
+VSQRTSHZrbk_Int 6565
+VSQRTSHZrbkz_Int 6566
+VSQRTSHZrk_Int 6567
+VSQRTSHZrkz_Int 6568
+VSQRTSSZm 6569
+VSQRTSSZm_Int 6570
+VSQRTSSZmk_Int 6571
+VSQRTSSZmkz_Int 6572
+VSQRTSSZr 6573
+VSQRTSSZr_Int 6574
+VSQRTSSZrb_Int 6575
+VSQRTSSZrbk_Int 6576
+VSQRTSSZrbkz_Int 6577
+VSQRTSSZrk_Int 6578
+VSQRTSSZrkz_Int 6579
+VSQRTSSm 6580
+VSQRTSSm_Int 6581
+VSQRTSSr 6582
+VSQRTSSr_Int 6583
+VSTMXCSR 6584
+VSUBBF 6585
+VSUBPDYrm 6586
+VSUBPDYrr 6587
+VSUBPDZ 6588
+VSUBPDZrm 6589
+VSUBPDZrmb 6590
+VSUBPDZrmbk 6591
+VSUBPDZrmbkz 6592
+VSUBPDZrmk 6593
+VSUBPDZrmkz 6594
+VSUBPDZrr 6595
+VSUBPDZrrb 6596
+VSUBPDZrrbk 6597
+VSUBPDZrrbkz 6598
+VSUBPDZrrk 6599
+VSUBPDZrrkz 6600
+VSUBPDrm 6601
+VSUBPDrr 6602
+VSUBPHZ 6603
+VSUBPHZrm 6604
+VSUBPHZrmb 6605
+VSUBPHZrmbk 6606
+VSUBPHZrmbkz 6607
+VSUBPHZrmk 6608
+VSUBPHZrmkz 6609
+VSUBPHZrr 6610
+VSUBPHZrrb 6611
+VSUBPHZrrbk 6612
+VSUBPHZrrbkz 6613
+VSUBPHZrrk 6614
+VSUBPHZrrkz 6615
+VSUBPSYrm 6616
+VSUBPSYrr 6617
+VSUBPSZ 6618
+VSUBPSZrm 6619
+VSUBPSZrmb 6620
+VSUBPSZrmbk 6621
+VSUBPSZrmbkz 6622
+VSUBPSZrmk 6623
+VSUBPSZrmkz 6624
+VSUBPSZrr 6625
+VSUBPSZrrb 6626
+VSUBPSZrrbk 6627
+VSUBPSZrrbkz 6628
+VSUBPSZrrk 6629
+VSUBPSZrrkz 6630
+VSUBPSrm 6631
+VSUBPSrr 6632
+VSUBSDZrm 6633
+VSUBSDZrm_Int 6634
+VSUBSDZrmk_Int 6635
+VSUBSDZrmkz_Int 6636
+VSUBSDZrr 6637
+VSUBSDZrr_Int 6638
+VSUBSDZrrb_Int 6639
+VSUBSDZrrbk_Int 6640
+VSUBSDZrrbkz_Int 6641
+VSUBSDZrrk_Int 6642
+VSUBSDZrrkz_Int 6643
+VSUBSDrm 6644
+VSUBSDrm_Int 6645
+VSUBSDrr 6646
+VSUBSDrr_Int 6647
+VSUBSHZrm 6648
+VSUBSHZrm_Int 6649
+VSUBSHZrmk_Int 6650
+VSUBSHZrmkz_Int 6651
+VSUBSHZrr 6652
+VSUBSHZrr_Int 6653
+VSUBSHZrrb_Int 6654
+VSUBSHZrrbk_Int 6655
+VSUBSHZrrbkz_Int 6656
+VSUBSHZrrk_Int 6657
+VSUBSHZrrkz_Int 6658
+VSUBSSZrm 6659
+VSUBSSZrm_Int 6660
+VSUBSSZrmk_Int 6661
+VSUBSSZrmkz_Int 6662
+VSUBSSZrr 6663
+VSUBSSZrr_Int 6664
+VSUBSSZrrb_Int 6665
+VSUBSSZrrbk_Int 6666
+VSUBSSZrrbkz_Int 6667
+VSUBSSZrrk_Int 6668
+VSUBSSZrrkz_Int 6669
+VSUBSSrm 6670
+VSUBSSrm_Int 6671
+VSUBSSrr 6672
+VSUBSSrr_Int 6673
+VTESTPDYrm 6674
+VTESTPDYrr 6675
+VTESTPDrm 6676
+VTESTPDrr 6677
+VTESTPSYrm 6678
+VTESTPSYrr 6679
+VTESTPSrm 6680
+VTESTPSrr 6681
+VUCOMISDZrm 6682
+VUCOMISDZrm_Int 6683
+VUCOMISDZrr 6684
+VUCOMISDZrr_Int 6685
+VUCOMISDZrrb 6686
+VUCOMISDrm 6687
+VUCOMISDrm_Int 6688
+VUCOMISDrr 6689
+VUCOMISDrr_Int 6690
+VUCOMISHZrm 6691
+VUCOMISHZrm_Int 6692
+VUCOMISHZrr 6693
+VUCOMISHZrr_Int 6694
+VUCOMISHZrrb 6695
+VUCOMISSZrm 6696
+VUCOMISSZrm_Int 6697
+VUCOMISSZrr 6698
+VUCOMISSZrr_Int 6699
+VUCOMISSZrrb 6700
+VUCOMISSrm 6701
+VUCOMISSrm_Int 6702
+VUCOMISSrr 6703
+VUCOMISSrr_Int 6704
+VUCOMXSDZrm 6705
+VUCOMXSDZrm_Int 6706
+VUCOMXSDZrr 6707
+VUCOMXSDZrr_Int 6708
+VUCOMXSDZrrb_Int 6709
+VUCOMXSHZrm 6710
+VUCOMXSHZrm_Int 6711
+VUCOMXSHZrr 6712
+VUCOMXSHZrr_Int 6713
+VUCOMXSHZrrb_Int 6714
+VUCOMXSSZrm 6715
+VUCOMXSSZrm_Int 6716
+VUCOMXSSZrr 6717
+VUCOMXSSZrr_Int 6718
+VUCOMXSSZrrb_Int 6719
+VUNPCKHPDYrm 6720
+VUNPCKHPDYrr 6721
+VUNPCKHPDZ 6722
+VUNPCKHPDZrm 6723
+VUNPCKHPDZrmb 6724
+VUNPCKHPDZrmbk 6725
+VUNPCKHPDZrmbkz 6726
+VUNPCKHPDZrmk 6727
+VUNPCKHPDZrmkz 6728
+VUNPCKHPDZrr 6729
+VUNPCKHPDZrrk 6730
+VUNPCKHPDZrrkz 6731
+VUNPCKHPDrm 6732
+VUNPCKHPDrr 6733
+VUNPCKHPSYrm 6734
+VUNPCKHPSYrr 6735
+VUNPCKHPSZ 6736
+VUNPCKHPSZrm 6737
+VUNPCKHPSZrmb 6738
+VUNPCKHPSZrmbk 6739
+VUNPCKHPSZrmbkz 6740
+VUNPCKHPSZrmk 6741
+VUNPCKHPSZrmkz 6742
+VUNPCKHPSZrr 6743
+VUNPCKHPSZrrk 6744
+VUNPCKHPSZrrkz 6745
+VUNPCKHPSrm 6746
+VUNPCKHPSrr 6747
+VUNPCKLPDYrm 6748
+VUNPCKLPDYrr 6749
+VUNPCKLPDZ 6750
+VUNPCKLPDZrm 6751
+VUNPCKLPDZrmb 6752
+VUNPCKLPDZrmbk 6753
+VUNPCKLPDZrmbkz 6754
+VUNPCKLPDZrmk 6755
+VUNPCKLPDZrmkz 6756
+VUNPCKLPDZrr 6757
+VUNPCKLPDZrrk 6758
+VUNPCKLPDZrrkz 6759
+VUNPCKLPDrm 6760
+VUNPCKLPDrr 6761
+VUNPCKLPSYrm 6762
+VUNPCKLPSYrr 6763
+VUNPCKLPSZ 6764
+VUNPCKLPSZrm 6765
+VUNPCKLPSZrmb 6766
+VUNPCKLPSZrmbk 6767
+VUNPCKLPSZrmbkz 6768
+VUNPCKLPSZrmk 6769
+VUNPCKLPSZrmkz 6770
+VUNPCKLPSZrr 6771
+VUNPCKLPSZrrk 6772
+VUNPCKLPSZrrkz 6773
+VUNPCKLPSrm 6774
+VUNPCKLPSrr 6775
+VXORPDYrm 6776
+VXORPDYrr 6777
+VXORPDZ 6778
+VXORPDZrm 6779
+VXORPDZrmb 6780
+VXORPDZrmbk 6781
+VXORPDZrmbkz 6782
+VXORPDZrmk 6783
+VXORPDZrmkz 6784
+VXORPDZrr 6785
+VXORPDZrrk 6786
+VXORPDZrrkz 6787
+VXORPDrm 6788
+VXORPDrr 6789
+VXORPSYrm 6790
+VXORPSYrr 6791
+VXORPSZ 6792
+VXORPSZrm 6793
+VXORPSZrmb 6794
+VXORPSZrmbk 6795
+VXORPSZrmbkz 6796
+VXORPSZrmk 6797
+VXORPSZrmkz 6798
+VXORPSZrr 6799
+VXORPSZrrk 6800
+VXORPSZrrkz 6801
+VXORPSrm 6802
+VXORPSrr 6803
+VZEROALL 6804
+VZEROUPPER 6805
+V_SET 6806
+V_SETALLONES 6807
+WAIT 6808
+WBINVD 6809
+WBNOINVD 6810
+WRFLAGS 6811
+WRFSBASE 6812
+WRGSBASE 6813
+WRMSR 6814
+WRMSRLIST 6815
+WRMSRNS 6816
+WRMSRNSir 6817
+WRMSRNSir_EVEX 6818
+WRPKRUr 6819
+WRSSD 6820
+WRSSD_EVEX 6821
+WRSSQ 6822
+WRSSQ_EVEX 6823
+WRUSSD 6824
+WRUSSD_EVEX 6825
+WRUSSQ 6826
+WRUSSQ_EVEX 6827
+XABORT 6828
+XABORT_DEF 6829
+XACQUIRE_PREFIX 6830
+XADD 6831
+XAM_F 6832
+XAM_Fp 6833
+XBEGIN 6834
+XCHG 6835
+XCH_F 6836
+XCRYPTCBC 6837
+XCRYPTCFB 6838
+XCRYPTCTR 6839
+XCRYPTECB 6840
+XCRYPTOFB 6841
+XEND 6842
+XGETBV 6843
+XLAT 6844
+XOR 6845
+XORPDrm 6846
+XORPDrr 6847
+XORPSrm 6848
+XORPSrr 6849
+XRELEASE_PREFIX 6850
+XRESLDTRK 6851
+XRSTOR 6852
+XRSTORS 6853
+XSAVE 6854
+XSAVEC 6855
+XSAVEOPT 6856
+XSAVES 6857
+XSETBV 6858
+XSHA 6859
+XSTORE 6860
+XSUSLDTRK 6861
+XTEST 6862
+Immediate 6863
+CImmediate 6864
+FPImmediate 6865
+MBB 6866
+FrameIndex 6867
+ConstantPoolIndex 6868
+TargetIndex 6869
+JumpTableIndex 6870
+ExternalSymbol 6871
+GlobalAddress 6872
+BlockAddress 6873
+RegisterMask 6874
+RegisterLiveOut 6875
+Metadata 6876
+MCSymbol 6877
+CFIIndex 6878
+IntrinsicID 6879
+Predicate 6880
+ShuffleMask 6881
+PhyReg_GR8 6882
+PhyReg_GRH8 6883
+PhyReg_GR8_NOREX2 6884
+PhyReg_GR8_NOREX 6885
+PhyReg_GR8_ABCD_H 6886
+PhyReg_GR8_ABCD_L 6887
+PhyReg_GRH16 6888
+PhyReg_GR16 6889
+PhyReg_GR16_NOREX2 6890
+PhyReg_GR16_NOREX 6891
+PhyReg_VK1 6892
+PhyReg_VK16 6893
+PhyReg_VK2 6894
+PhyReg_VK4 6895
+PhyReg_VK8 6896
+PhyReg_VK16WM 6897
+PhyReg_VK1WM 6898
+PhyReg_VK2WM 6899
+PhyReg_VK4WM 6900
+PhyReg_VK8WM 6901
+PhyReg_SEGMENT_REG 6902
+PhyReg_GR16_ABCD 6903
+PhyReg_FPCCR 6904
+PhyReg_FR16X 6905
+PhyReg_FR16 6906
+PhyReg_VK16PAIR 6907
+PhyReg_VK1PAIR 6908
+PhyReg_VK2PAIR 6909
+PhyReg_VK4PAIR 6910
+PhyReg_VK8PAIR 6911
+PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM 6912
+PhyReg_LOW32_ADDR_ACCESS_RBP 6913
+PhyReg_LOW32_ADDR_ACCESS 6914
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 6915
+PhyReg_FR32X 6916
+PhyReg_GR32 6917
+PhyReg_GR32_NOSP 6918
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 6919
+PhyReg_DEBUG_REG 6920
+PhyReg_FR32 6921
+PhyReg_GR32_NOREX2 6922
+PhyReg_GR32_NOREX2_NOSP 6923
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 6924
+PhyReg_GR32_NOREX 6925
+PhyReg_VK32 6926
+PhyReg_GR32_NOREX_NOSP 6927
+PhyReg_RFP32 6928
+PhyReg_VK32WM 6929
+PhyReg_GR32_ABCD 6930
+PhyReg_GR32_TC 6931
+PhyReg_GR32_ABCD_and_GR32_TC 6932
+PhyReg_GR32_AD 6933
+PhyReg_GR32_ArgRef 6934
+PhyReg_GR32_BPSP 6935
+PhyReg_GR32_BSI 6936
+PhyReg_GR32_CB 6937
+PhyReg_GR32_DC 6938
+PhyReg_GR32_DIBP 6939
+PhyReg_GR32_SIDI 6940
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 6941
+PhyReg_CCR 6942
+PhyReg_DFCCR 6943
+PhyReg_GR32_ABCD_and_GR32_BSI 6944
+PhyReg_GR32_AD_and_GR32_ArgRef 6945
+PhyReg_GR32_ArgRef_and_GR32_CB 6946
+PhyReg_GR32_BPSP_and_GR32_DIBP 6947
+PhyReg_GR32_BPSP_and_GR32_TC 6948
+PhyReg_GR32_BSI_and_GR32_SIDI 6949
+PhyReg_GR32_DIBP_and_GR32_SIDI 6950
+PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 6951
+PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit 6952
+PhyReg_RFP64 6953
+PhyReg_GR64 6954
+PhyReg_FR64X 6955
+PhyReg_GR64_with_sub_8bit 6956
+PhyReg_GR64_NOSP 6957
+PhyReg_GR64_NOREX2 6958
+PhyReg_CONTROL_REG 6959
+PhyReg_FR64 6960
+PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2 6961
+PhyReg_GR64_NOREX2_NOSP 6962
+PhyReg_GR64PLTSafe 6963
+PhyReg_GR64_TC 6964
+PhyReg_GR64_NOREX 6965
+PhyReg_GR64_TCW64 6966
+PhyReg_GR64_TC_with_sub_8bit 6967
+PhyReg_GR64_NOREX2_NOSP_and_GR64_TC 6968
+PhyReg_GR64_TCW64_with_sub_8bit 6969
+PhyReg_GR64_TC_and_GR64_TCW64 6970
+PhyReg_GR64_with_sub_16bit_in_GR16_NOREX 6971
+PhyReg_VK64 6972
+PhyReg_VR64 6973
+PhyReg_GR64PLTSafe_and_GR64_TC 6974
+PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64 6975
+PhyReg_GR64_NOREX_NOSP 6976
+PhyReg_GR64_NOREX_and_GR64_TC 6977
+PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 6978
+PhyReg_VK64WM 6979
+PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 6980
+PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 6981
+PhyReg_GR64PLTSafe_and_GR64_TCW64 6982
+PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 6983
+PhyReg_GR64_NOREX_and_GR64_TCW64 6984
+PhyReg_GR64_ABCD 6985
+PhyReg_GR64_with_sub_32bit_in_GR32_TC 6986
+PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 6987
+PhyReg_GR64_AD 6988
+PhyReg_GR64_ArgRef 6989
+PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP 6990
+PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef 6991
+PhyReg_GR64_with_sub_32bit_in_GR32_BPSP 6992
+PhyReg_GR64_with_sub_32bit_in_GR32_BSI 6993
+PhyReg_GR64_with_sub_32bit_in_GR32_CB 6994
+PhyReg_GR64_with_sub_32bit_in_GR32_DIBP 6995
+PhyReg_GR64_with_sub_32bit_in_GR32_SIDI 6996
+PhyReg_GR64_A 6997
+PhyReg_GR64_ArgRef_and_GR64_TC 6998
+PhyReg_GR64_and_LOW32_ADDR_ACCESS 6999
+PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7000
+PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7001
+PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7002
+PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7003
+PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7004
+PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7005
+PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7006
+PhyReg_RST 7007
+PhyReg_RFP80 7008
+PhyReg_RFP80_7 7009
+PhyReg_VR128X 7010
+PhyReg_VR128 7011
+PhyReg_VR256X 7012
+PhyReg_VR256 7013
+PhyReg_VR512 7014
+PhyReg_VR512_0_15 7015
+PhyReg_TILE 7016
+VirtReg_GR8 7017
+VirtReg_GRH8 7018
+VirtReg_GR8_NOREX2 7019
+VirtReg_GR8_NOREX 7020
+VirtReg_GR8_ABCD_H 7021
+VirtReg_GR8_ABCD_L 7022
+VirtReg_GRH16 7023
+VirtReg_GR16 7024
+VirtReg_GR16_NOREX2 7025
+VirtReg_GR16_NOREX 7026
+VirtReg_VK1 7027
+VirtReg_VK16 7028
+VirtReg_VK2 7029
+VirtReg_VK4 7030
+VirtReg_VK8 7031
+VirtReg_VK16WM 7032
+VirtReg_VK1WM 7033
+VirtReg_VK2WM 7034
+VirtReg_VK4WM 7035
+VirtReg_VK8WM 7036
+VirtReg_SEGMENT_REG 7037
+VirtReg_GR16_ABCD 7038
+VirtReg_FPCCR 7039
+VirtReg_FR16X 7040
+VirtReg_FR16 7041
+VirtReg_VK16PAIR 7042
+VirtReg_VK1PAIR 7043
+VirtReg_VK2PAIR 7044
+VirtReg_VK4PAIR 7045
+VirtReg_VK8PAIR 7046
+VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM 7047
+VirtReg_LOW32_ADDR_ACCESS_RBP 7048
+VirtReg_LOW32_ADDR_ACCESS 7049
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 7050
+VirtReg_FR32X 7051
+VirtReg_GR32 7052
+VirtReg_GR32_NOSP 7053
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 7054
+VirtReg_DEBUG_REG 7055
+VirtReg_FR32 7056
+VirtReg_GR32_NOREX2 7057
+VirtReg_GR32_NOREX2_NOSP 7058
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 7059
+VirtReg_GR32_NOREX 7060
+VirtReg_VK32 7061
+VirtReg_GR32_NOREX_NOSP 7062
+VirtReg_RFP32 7063
+VirtReg_VK32WM 7064
+VirtReg_GR32_ABCD 7065
+VirtReg_GR32_TC 7066
+VirtReg_GR32_ABCD_and_GR32_TC 7067
+VirtReg_GR32_AD 7068
+VirtReg_GR32_ArgRef 7069
+VirtReg_GR32_BPSP 7070
+VirtReg_GR32_BSI 7071
+VirtReg_GR32_CB 7072
+VirtReg_GR32_DC 7073
+VirtReg_GR32_DIBP 7074
+VirtReg_GR32_SIDI 7075
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 7076
+VirtReg_CCR 7077
+VirtReg_DFCCR 7078
+VirtReg_GR32_ABCD_and_GR32_BSI 7079
+VirtReg_GR32_AD_and_GR32_ArgRef 7080
+VirtReg_GR32_ArgRef_and_GR32_CB 7081
+VirtReg_GR32_BPSP_and_GR32_DIBP 7082
+VirtReg_GR32_BPSP_and_GR32_TC 7083
+VirtReg_GR32_BSI_and_GR32_SIDI 7084
+VirtReg_GR32_DIBP_and_GR32_SIDI 7085
+VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 7086
+VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit 7087
+VirtReg_RFP64 7088
+VirtReg_GR64 7089
+VirtReg_FR64X 7090
+VirtReg_GR64_with_sub_8bit 7091
+VirtReg_GR64_NOSP 7092
+VirtReg_GR64_NOREX2 7093
+VirtReg_CONTROL_REG 7094
+VirtReg_FR64 7095
+VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2 7096
+VirtReg_GR64_NOREX2_NOSP 7097
+VirtReg_GR64PLTSafe 7098
+VirtReg_GR64_TC 7099
+VirtReg_GR64_NOREX 7100
+VirtReg_GR64_TCW64 7101
+VirtReg_GR64_TC_with_sub_8bit 7102
+VirtReg_GR64_NOREX2_NOSP_and_GR64_TC 7103
+VirtReg_GR64_TCW64_with_sub_8bit 7104
+VirtReg_GR64_TC_and_GR64_TCW64 7105
+VirtReg_GR64_with_sub_16bit_in_GR16_NOREX 7106
+VirtReg_VK64 7107
+VirtReg_VR64 7108
+VirtReg_GR64PLTSafe_and_GR64_TC 7109
+VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64 7110
+VirtReg_GR64_NOREX_NOSP 7111
+VirtReg_GR64_NOREX_and_GR64_TC 7112
+VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 7113
+VirtReg_VK64WM 7114
+VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 7115
+VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7116
+VirtReg_GR64PLTSafe_and_GR64_TCW64 7117
+VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7118
+VirtReg_GR64_NOREX_and_GR64_TCW64 7119
+VirtReg_GR64_ABCD 7120
+VirtReg_GR64_with_sub_32bit_in_GR32_TC 7121
+VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7122
+VirtReg_GR64_AD 7123
+VirtReg_GR64_ArgRef 7124
+VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7125
+VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef 7126
+VirtReg_GR64_with_sub_32bit_in_GR32_BPSP 7127
+VirtReg_GR64_with_sub_32bit_in_GR32_BSI 7128
+VirtReg_GR64_with_sub_32bit_in_GR32_CB 7129
+VirtReg_GR64_with_sub_32bit_in_GR32_DIBP 7130
+VirtReg_GR64_with_sub_32bit_in_GR32_SIDI 7131
+VirtReg_GR64_A 7132
+VirtReg_GR64_ArgRef_and_GR64_TC 7133
+VirtReg_GR64_and_LOW32_ADDR_ACCESS 7134
+VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7135
+VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7136
+VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7137
+VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7138
+VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7139
+VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7140
+VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7141
+VirtReg_RST 7142
+VirtReg_RFP80 7143
+VirtReg_RFP80_7 7144
+VirtReg_VR128X 7145
+VirtReg_VR128 7146
+VirtReg_VR256X 7147
+VirtReg_VR256 7148
+VirtReg_VR512 7149
+VirtReg_VR512_0_15 7150
+VirtReg_TILE 7151
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s
index 3d7a67d..621cad6 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s
@@ -630,593 +630,593 @@ vfwredusum.vs v8, v8, v8
# CHECK: [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMIN_VS vredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDOR_VS vredor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDSUM_VS vredsum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDXOR_VS vredxor.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUMU_VS vwredsumu.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUM_VS vwredsum.vs v8, v16, v24
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMAX_VS vfredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8
+# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMIN_VS vfredmin.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 24 20.00 24 SMX60_VFP[20] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 48 24.00 48 SMX60_VFP[24] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 96 48.00 96 SMX60_VFP[48] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 192 96.00 192 SMX60_VFP[96] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 384 384.00 384 SMX60_VFP[384] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VFP[12] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 48 24.00 48 SMX60_VFP[24] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 96 48.00 96 SMX60_VFP[48] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 192 192.00 192 SMX60_VFP[192] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 12 6.00 12 SMX60_VFP[6] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 24 12.00 24 SMX60_VFP[12] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 48 24.00 48 SMX60_VFP[24] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 96 96.00 96 SMX60_VFP[96] VFREDOSUM_VS vfredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDUSUM_VS vfredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 32 27.00 32 SMX60_VFP[27] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 16 11.00 16 SMX60_VFP[11] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 64 32.00 64 SMX60_VFP[32] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 128 64.00 128 SMX60_VFP[64] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 256 128.00 256 SMX60_VFP[128] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 512 512.00 512 SMX60_VFP[512] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 16 11.00 16 SMX60_VFP[11] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 32 16.00 32 SMX60_VFP[16] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 64 32.00 64 SMX60_VFP[32] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 128 64.00 128 SMX60_VFP[64] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: 1 256 256.00 256 SMX60_VFP[256] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 32 27.00 32 SMX60_VFP[27] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 16 11.00 16 SMX60_VFP[11] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 64 32.00 64 SMX60_VFP[32] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 128 64.00 128 SMX60_VFP[64] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 256 128.00 256 SMX60_VFP[128] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 512 512.00 512 SMX60_VFP[512] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 16 11.00 16 SMX60_VFP[11] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 32 16.00 32 SMX60_VFP[16] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 64 32.00 64 SMX60_VFP[32] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 128 64.00 128 SMX60_VFP[64] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: 1 256 256.00 256 SMX60_VFP[256] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8
# CHECK: Resources:
# CHECK-NEXT: [0] - SMX60_FP
@@ -1230,595 +1230,595 @@ vfwredusum.vs v8, v8, v8
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6]
-# CHECK-NEXT: - 294.00 - - - 82.00 212.00 -
+# CHECK-NEXT: - 294.00 - - - 4271.00 2028.00 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions:
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredand.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredmaxu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredminu.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredsum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 4.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 10.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8
+# CHECK-NEXT: - - - - - - 35.00 - vredxor.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 35.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 35.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 35.00 - vwredsumu.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 35.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 35.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24
+# CHECK-NEXT: - - - - - - 35.00 - vwredsum.vs v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 14.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 20.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 57.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 14.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 20.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 57.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 14.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 20.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 57.00 - - vfredmax.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 14.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 20.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 57.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 14.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 20.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 57.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 14.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 20.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 57.00 - - vfredmin.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 20.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 24.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 48.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 96.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 384.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 12.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 24.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 48.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 192.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 6.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 12.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 24.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 96.00 - - vfredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 14.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 20.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 57.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 14.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 20.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 57.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 14.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 20.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 57.00 - - vfredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 27.00 - - vfwredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 11.00 - - vfwredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 32.00 - - vfwredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 64.00 - - vfwredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 128.00 - - vfwredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 512.00 - - vfwredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 11.00 - - vfwredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 16.00 - - vfwredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 32.00 - - vfwredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 64.00 - - vfwredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 256.00 - - vfwredosum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 27.00 - - vfwredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 11.00 - - vfwredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 32.00 - - vfwredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 64.00 - - vfwredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 128.00 - - vfwredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 512.00 - - vfwredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 11.00 - - vfwredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 16.00 - - vfwredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 32.00 - - vfwredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 64.00 - - vfwredusum.vs v8, v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8
+# CHECK-NEXT: - - - - - 256.00 - - vfwredusum.vs v8, v8, v8
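The updated figures above replace the flat 1.00 resource usage with LMUL-dependent costs: the integer reductions (vredsum, vredmin, vredor, vredxor, vwredsum, ...) settle at 2.00/4.00/10.00/35.00 for m1/m2/m4/m8 (1.00 or 2.00 for fractional LMULs), while the ordered float reductions grow much faster, e.g. vfredosum at e16 reaches 384.00 at m8. A minimal C++ sketch, transcribing the integer-reduction column straight from the CHECK lines above (illustrative only; the real model encodes this in TableGen scheduling classes):

#include <cstdio>

enum class LMUL { MF8, MF4, MF2, M1, M2, M4, M8 };

// Per-LMUL resource cycles for vred*.vs, read off the updated CHECK lines.
static double intReductionCycles(LMUL L) {
  switch (L) {
  case LMUL::MF8:
  case LMUL::MF4: return 1.0;
  case LMUL::MF2:
  case LMUL::M1:  return 2.0;
  case LMUL::M2:  return 4.0;
  case LMUL::M4:  return 10.0;
  case LMUL::M8:  return 35.0;
  }
  return 0.0;
}

int main() { std::printf("%.2f\n", intReductionCycles(LMUL::M8)); } // 35.00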
diff --git a/llvm/test/tools/llvm-profdata/profile-version.test b/llvm/test/tools/llvm-profdata/profile-version.test
index cb68a64..e811699 100644
--- a/llvm/test/tools/llvm-profdata/profile-version.test
+++ b/llvm/test/tools/llvm-profdata/profile-version.test
@@ -2,7 +2,7 @@ Test the profile version.
RUN: llvm-profdata merge -o %t.profdata %p/Inputs/basic.proftext
RUN: llvm-profdata show --profile-version %t.profdata | FileCheck %s
-CHECK: Profile version: 12
+CHECK: Profile version: 13
RUN: llvm-profdata merge -o %t.prev.profdata %p/Inputs/basic.proftext --write-prev-version
RUN: llvm-profdata show --profile-version %t.prev.profdata | FileCheck %s --check-prefix=PREV
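Two behavior changes are pinned down here: the default profile version moves from 12 to 13, and the merge step gains a --write-prev-version mode whose output is checked under the PREV prefix. A minimal sketch, not the llvm-profdata implementation, of the version gate a reader typically applies after such a bump:

#include <cstdint>

// Hypothetical reader-side check: older profiles remain readable, profiles
// produced by a newer writer are rejected.
constexpr uint64_t CurrentProfVersion = 13; // bumped from 12 by this change

static bool isReadableVersion(uint64_t OnDiskVersion) {
  return OnDiskVersion <= CurrentProfVersion;
}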
diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp
index 972dac8..1142c55 100644
--- a/llvm/unittests/IR/PatternMatch.cpp
+++ b/llvm/unittests/IR/PatternMatch.cpp
@@ -2657,4 +2657,31 @@ TEST_F(PatternMatchTest, ShiftOrSelf) {
EXPECT_EQ(ShAmtC, 0U);
}
+TEST_F(PatternMatchTest, CommutativeDeferredIntrinsicMatch) {
+ Value *X = ConstantFP::get(IRB.getDoubleTy(), 1.0);
+ Value *Y = ConstantFP::get(IRB.getDoubleTy(), 2.0);
+
+ auto CheckMatch = [X, Y](Value *Pattern) {
+ Value *tX = nullptr, *tY = nullptr;
+ EXPECT_TRUE(
+ match(Pattern, m_c_Intrinsic<Intrinsic::minimum>(
+ m_Value(tX), m_c_Intrinsic<Intrinsic::minimum>(
+ m_Deferred(tX), m_Value(tY)))));
+ EXPECT_EQ(tX, X);
+ EXPECT_EQ(tY, Y);
+ };
+ CheckMatch(IRB.CreateBinaryIntrinsic(
+ Intrinsic::minimum, X,
+ IRB.CreateBinaryIntrinsic(Intrinsic::minimum, X, Y)));
+ CheckMatch(IRB.CreateBinaryIntrinsic(
+ Intrinsic::minimum, X,
+ IRB.CreateBinaryIntrinsic(Intrinsic::minimum, Y, X)));
+ CheckMatch(IRB.CreateBinaryIntrinsic(
+ Intrinsic::minimum, IRB.CreateBinaryIntrinsic(Intrinsic::minimum, X, Y),
+ X));
+ CheckMatch(IRB.CreateBinaryIntrinsic(
+ Intrinsic::minimum, IRB.CreateBinaryIntrinsic(Intrinsic::minimum, Y, X),
+ X));
+}
+
} // anonymous namespace.
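The new test exercises two matcher features together: m_c_Intrinsic tries both operand orders of a commutative intrinsic, and m_Deferred requires the inner operand to be the same Value the outer m_Value already captured; the four CheckMatch calls cover every commutation of min(X, min(X, Y)). A sketch of how this matcher shape is typically consumed (hypothetical helper, not part of this patch):

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Recognize min(X, min(X, Y)) in any operand order; for llvm.minimum this is
// equivalent to min(X, Y), so a caller could rewrite it using X and Y.
static bool isRedundantMinOfMin(Value *V, Value *&X, Value *&Y) {
  return match(V, m_c_Intrinsic<Intrinsic::minimum>(
                      m_Value(X), m_c_Intrinsic<Intrinsic::minimum>(
                                      m_Deferred(X), m_Value(Y))));
}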
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index c1791dfa..82ecc16 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -704,6 +704,20 @@ TEST_F(VPBasicBlockTest, reassociateBlocks) {
}
}
+TEST_F(VPBasicBlockTest, splitAtEnd) {
+ VPlan &Plan = getPlan();
+ VPInstruction *VPI = new VPInstruction(0, {});
+ VPBasicBlock *VPBB = Plan.createVPBasicBlock("VPBB1", VPI);
+ VPBlockUtils::connectBlocks(Plan.getEntry(), VPBB);
+ VPBlockUtils::connectBlocks(VPBB, Plan.getScalarHeader());
+ VPBB->splitAt(VPBB->end());
+ EXPECT_EQ(VPBB->size(), 1u);
+ EXPECT_EQ(&VPBB->front(), VPI);
+ auto *Split = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
+ EXPECT_TRUE(Split->empty());
+ EXPECT_EQ(Split->getSingleSuccessor(), Plan.getScalarHeader());
+}
+
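splitAt(end()) is the boundary case: every recipe stays in the original block and the split yields an empty successor that inherits the old edge. A toy illustration of that contract (plain C++, not the VPlan API), mirroring the assertions above:

#include <cassert>
#include <list>
#include <vector>

struct Block {
  std::list<int> Insts;
  std::vector<Block *> Succs;
  // Move [It, end()) into a new block; the new block takes over this block's
  // successors and becomes its single successor.
  Block *splitAt(std::list<int>::iterator It) {
    Block *New = new Block;
    New->Insts.splice(New->Insts.begin(), Insts, It, Insts.end());
    New->Succs = std::move(Succs);
    Succs = {New};
    return New;
  }
};

int main() {
  Block Exit, B;
  B.Insts = {1};
  B.Succs = {&Exit};
  Block *Split = B.splitAt(B.Insts.end());
  assert(B.Insts.size() == 1 && Split->Insts.empty());
  assert(B.Succs.front() == Split && Split->Succs.front() == &Exit);
  delete Split;
}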
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
TEST_F(VPBasicBlockTest, print) {
VPInstruction *TC = new VPInstruction(Instruction::PHI, {});
@@ -955,16 +969,40 @@ compound=true
#endif
using VPRecipeTest = VPlanTestBase;
+
+namespace {
+template <typename RecipeT, typename T, typename... Rest>
+void checkVPRecipeCastImpl(RecipeT *R) {
+ // Direct checks on recipe pointer
+ EXPECT_TRUE(isa<T>(R));
+ EXPECT_EQ(R, dyn_cast<T>(R));
+ (void)cast<T>(R); // Verify cast succeeds (asserts on failure)
+
+ // Check through base pointer
+ VPRecipeBase *BaseR = R;
+ EXPECT_TRUE(isa<T>(BaseR));
+ EXPECT_EQ(R, dyn_cast<T>(BaseR));
+ (void)cast<T>(BaseR);
+
+ // Check through const base pointer
+ const VPRecipeBase *ConstBaseR = R;
+ EXPECT_TRUE(isa<T>(ConstBaseR));
+ EXPECT_EQ(R, dyn_cast<T>(ConstBaseR));
+ (void)cast<T>(ConstBaseR);
+
+ if constexpr (sizeof...(Rest) > 0)
+ checkVPRecipeCastImpl<RecipeT, Rest...>(R);
+}
+} // namespace
+
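checkVPRecipeCastImpl walks its type pack with C++17 if constexpr: each instantiation checks one candidate type T against the recipe, then recurses on Rest until the pack is empty, so no terminating overload is needed. A standalone sketch of the same recursion pattern:

#include <cstdio>

template <typename T, typename... Rest> void printSizes() {
  std::printf("sizeof = %zu\n", sizeof(T));
  if constexpr (sizeof...(Rest) > 0)
    printSizes<Rest...>(); // the call is discarded entirely once the pack is empty
}

int main() { printSizes<int, double, char>(); } // one line per pack element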
TEST_F(VPRecipeTest, CastVPInstructionToVPUser) {
IntegerType *Int32 = IntegerType::get(C, 32);
VPlan &Plan = getPlan();
VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
VPInstruction Recipe(Instruction::Add, {Op1, Op2});
- EXPECT_TRUE(isa<VPUser>(&Recipe));
- VPRecipeBase *BaseR = &Recipe;
- EXPECT_TRUE(isa<VPUser>(BaseR));
- EXPECT_EQ(&Recipe, BaseR);
+
+ checkVPRecipeCastImpl<VPInstruction, VPUser>(&Recipe);
}
TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) {
@@ -978,10 +1016,8 @@ TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) {
Args.push_back(Op1);
Args.push_back(Op2);
VPWidenRecipe WidenR(*AI, make_range(Args.begin(), Args.end()));
- EXPECT_TRUE(isa<VPUser>(&WidenR));
- VPRecipeBase *WidenRBase = &WidenR;
- EXPECT_TRUE(isa<VPUser>(WidenRBase));
- EXPECT_EQ(&WidenR, WidenRBase);
+
+ checkVPRecipeCastImpl<VPWidenRecipe, VPUser>(&WidenR);
delete AI;
}
@@ -999,10 +1035,8 @@ TEST_F(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) {
Args.push_back(Op2);
Args.push_back(CalledFn);
VPWidenCallRecipe Recipe(Call, Fn, Args);
- EXPECT_TRUE(isa<VPUser>(&Recipe));
- VPRecipeBase *BaseR = &Recipe;
- EXPECT_TRUE(isa<VPUser>(BaseR));
- EXPECT_EQ(&Recipe, BaseR);
+
+ checkVPRecipeCastImpl<VPWidenCallRecipe, VPUser>(&Recipe);
VPValue *VPV = &Recipe;
EXPECT_TRUE(VPV->getDefiningRecipe());
@@ -1027,13 +1061,10 @@ TEST_F(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) {
Args.push_back(Op3);
VPWidenSelectRecipe WidenSelectR(*SelectI,
make_range(Args.begin(), Args.end()));
- EXPECT_TRUE(isa<VPUser>(&WidenSelectR));
- VPRecipeBase *BaseR = &WidenSelectR;
- EXPECT_TRUE(isa<VPUser>(BaseR));
- EXPECT_EQ(&WidenSelectR, BaseR);
+
+ checkVPRecipeCastImpl<VPWidenSelectRecipe, VPUser>(&WidenSelectR);
VPValue *VPV = &WidenSelectR;
- EXPECT_TRUE(isa<VPRecipeBase>(VPV->getDefiningRecipe()));
EXPECT_EQ(&WidenSelectR, VPV->getDefiningRecipe());
delete SelectI;
@@ -1051,10 +1082,8 @@ TEST_F(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) {
Args.push_back(Op1);
Args.push_back(Op2);
VPWidenGEPRecipe Recipe(GEP, make_range(Args.begin(), Args.end()));
- EXPECT_TRUE(isa<VPUser>(&Recipe));
- VPRecipeBase *BaseR = &Recipe;
- EXPECT_TRUE(isa<VPUser>(BaseR));
- EXPECT_EQ(&Recipe, BaseR);
+
+ checkVPRecipeCastImpl<VPWidenGEPRecipe, VPUser>(&Recipe);
VPValue *VPV = &Recipe;
EXPECT_TRUE(isa<VPRecipeBase>(VPV->getDefiningRecipe()));
@@ -1063,6 +1092,28 @@ TEST_F(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) {
delete GEP;
}
+TEST_F(VPRecipeTest, CastVPWidenCastRecipeToVPUser) {
+ VPlan &Plan = getPlan();
+ IntegerType *Int32 = IntegerType::get(C, 32);
+ IntegerType *Int64 = IntegerType::get(C, 64);
+ auto *Cast = CastInst::CreateZExtOrBitCast(PoisonValue::get(Int32), Int64);
+ VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
+ VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, *Cast);
+
+ checkVPRecipeCastImpl<VPWidenCastRecipe, VPUser>(&Recipe);
+ delete Cast;
+}
+
+TEST_F(VPRecipeTest, CastVPWidenIntrinsicRecipeToVPUser) {
+ VPlan &Plan = getPlan();
+ IntegerType *Int32 = IntegerType::get(C, 32);
+ VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
+ VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
+ VPWidenIntrinsicRecipe Recipe(Intrinsic::smax, {Op1, Op2}, Int32);
+
+ checkVPRecipeCastImpl<VPWidenIntrinsicRecipe, VPUser>(&Recipe);
+}
+
TEST_F(VPRecipeTest, CastVPBlendRecipeToVPUser) {
VPlan &Plan = getPlan();
IntegerType *Int32 = IntegerType::get(C, 32);
@@ -1076,9 +1127,9 @@ TEST_F(VPRecipeTest, CastVPBlendRecipeToVPUser) {
Args.push_back(I2);
Args.push_back(M2);
VPBlendRecipe Recipe(Phi, Args, {});
- EXPECT_TRUE(isa<VPUser>(&Recipe));
- VPRecipeBase *BaseR = &Recipe;
- EXPECT_TRUE(isa<VPUser>(BaseR));
+
+ checkVPRecipeCastImpl<VPBlendRecipe, VPUser>(&Recipe);
+
delete Phi;
}
@@ -1089,10 +1140,8 @@ TEST_F(VPRecipeTest, CastVPInterleaveRecipeToVPUser) {
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
InterleaveGroup<Instruction> IG(4, false, Align(4));
VPInterleaveRecipe Recipe(&IG, Addr, {}, Mask, false, {}, DebugLoc());
- EXPECT_TRUE(isa<VPUser>(&Recipe));
- VPRecipeBase *BaseR = &Recipe;
- EXPECT_TRUE(isa<VPUser>(BaseR));
- EXPECT_EQ(&Recipe, BaseR);
+
+ checkVPRecipeCastImpl<VPInterleaveRecipe, VPUser>(&Recipe);
}
TEST_F(VPRecipeTest, CastVPReplicateRecipeToVPUser) {
@@ -1107,9 +1156,9 @@ TEST_F(VPRecipeTest, CastVPReplicateRecipeToVPUser) {
FunctionType *FTy = FunctionType::get(Int32, false);
auto *Call = CallInst::Create(FTy, PoisonValue::get(FTy));
VPReplicateRecipe Recipe(Call, make_range(Args.begin(), Args.end()), true);
- EXPECT_TRUE(isa<VPUser>(&Recipe));
- VPRecipeBase *BaseR = &Recipe;
- EXPECT_TRUE(isa<VPUser>(BaseR));
+
+ checkVPRecipeCastImpl<VPReplicateRecipe, VPUser>(&Recipe);
+
delete Call;
}
@@ -1118,10 +1167,8 @@ TEST_F(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) {
IntegerType *Int32 = IntegerType::get(C, 32);
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPBranchOnMaskRecipe Recipe(Mask, {});
- EXPECT_TRUE(isa<VPUser>(&Recipe));
- VPRecipeBase *BaseR = &Recipe;
- EXPECT_TRUE(isa<VPUser>(BaseR));
- EXPECT_EQ(&Recipe, BaseR);
+
+ checkVPRecipeCastImpl<VPBranchOnMaskRecipe, VPUser>(&Recipe);
}
TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
@@ -1133,10 +1180,8 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {});
- EXPECT_TRUE(isa<VPUser>(&Recipe));
- VPRecipeBase *BaseR = &Recipe;
- EXPECT_TRUE(isa<VPUser>(BaseR));
- EXPECT_EQ(&Recipe, BaseR);
+
+ checkVPRecipeCastImpl<VPWidenLoadRecipe, VPUser>(&Recipe);
VPValue *VPV = Recipe.getVPSingleValue();
EXPECT_TRUE(isa<VPRecipeBase>(VPV->getDefiningRecipe()));
@@ -1145,6 +1190,71 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
delete Load;
}
+TEST_F(VPRecipeTest, CastVPInterleaveEVLRecipeToVPUser) {
+ VPlan &Plan = getPlan();
+ IntegerType *Int32 = IntegerType::get(C, 32);
+ VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
+ VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
+ VPValue *EVL = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 8));
+ InterleaveGroup<Instruction> IG(4, false, Align(4));
+ VPInterleaveRecipe BaseRecipe(&IG, Addr, {}, Mask, false, {}, DebugLoc());
+ VPInterleaveEVLRecipe Recipe(BaseRecipe, *EVL, Mask);
+
+ checkVPRecipeCastImpl<VPInterleaveEVLRecipe, VPUser>(&Recipe);
+}
+
+TEST_F(VPRecipeTest, CastVPWidenLoadEVLRecipeToVPUser) {
+ VPlan &Plan = getPlan();
+ IntegerType *Int32 = IntegerType::get(C, 32);
+ PointerType *Int32Ptr = PointerType::get(C, 0);
+ auto *Load =
+ new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
+ VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
+ VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
+ VPValue *EVL = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 8));
+ VPWidenLoadRecipe BaseLoad(*Load, Addr, Mask, true, false, {}, {});
+ VPWidenLoadEVLRecipe Recipe(BaseLoad, Addr, *EVL, Mask);
+
+ checkVPRecipeCastImpl<VPWidenLoadEVLRecipe, VPUser>(&Recipe);
+
+ delete Load;
+}
+
+TEST_F(VPRecipeTest, CastVPWidenStoreRecipeToVPUser) {
+ VPlan &Plan = getPlan();
+ IntegerType *Int32 = IntegerType::get(C, 32);
+ PointerType *Int32Ptr = PointerType::get(C, 0);
+ auto *Store = new StoreInst(PoisonValue::get(Int32),
+ PoisonValue::get(Int32Ptr), false, Align(1));
+ VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
+ VPValue *StoredVal = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 42));
+ VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
+ VPWidenStoreRecipe Recipe(*Store, Addr, StoredVal, Mask, true, false, {}, {});
+
+ checkVPRecipeCastImpl<VPWidenStoreRecipe, VPUser>(&Recipe);
+
+ delete Store;
+}
+
+TEST_F(VPRecipeTest, CastVPWidenStoreEVLRecipeToVPUser) {
+ VPlan &Plan = getPlan();
+ IntegerType *Int32 = IntegerType::get(C, 32);
+ PointerType *Int32Ptr = PointerType::get(C, 0);
+ auto *Store = new StoreInst(PoisonValue::get(Int32),
+ PoisonValue::get(Int32Ptr), false, Align(1));
+ VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
+ VPValue *StoredVal = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 42));
+ VPValue *EVL = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 8));
+ VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
+ VPWidenStoreRecipe BaseStore(*Store, Addr, StoredVal, Mask, true, false, {},
+ {});
+ VPWidenStoreEVLRecipe Recipe(BaseStore, Addr, *EVL, Mask);
+
+ checkVPRecipeCastImpl<VPWidenStoreEVLRecipe, VPUser>(&Recipe);
+
+ delete Store;
+}
+
TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
IntegerType *Int1 = IntegerType::get(C, 1);
IntegerType *Int32 = IntegerType::get(C, 32);
@@ -1592,9 +1702,7 @@ TEST_F(VPRecipeTest, CastVPReductionRecipeToVPUser) {
VPValue *CondOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 3));
VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp,
CondOp, VecOp, false);
- EXPECT_TRUE(isa<VPUser>(&Recipe));
- VPRecipeBase *BaseR = &Recipe;
- EXPECT_TRUE(isa<VPUser>(BaseR));
+ checkVPRecipeCastImpl<VPReductionRecipe, VPUser>(&Recipe);
delete Add;
}
@@ -1609,9 +1717,7 @@ TEST_F(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) {
CondOp, VecOp, false);
VPValue *EVL = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 0));
VPReductionEVLRecipe EVLRecipe(Recipe, *EVL, CondOp);
- EXPECT_TRUE(isa<VPUser>(&EVLRecipe));
- VPRecipeBase *BaseR = &EVLRecipe;
- EXPECT_TRUE(isa<VPUser>(BaseR));
+ checkVPRecipeCastImpl<VPReductionEVLRecipe, VPUser>(&EVLRecipe);
delete Add;
}
} // namespace
diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp
index 0b90f91..e725de1 100644
--- a/llvm/utils/TableGen/InstrInfoEmitter.cpp
+++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp
@@ -341,8 +341,6 @@ emitGetOperandIdxName(raw_ostream &OS,
void InstrInfoEmitter::emitOperandNameMappings(
raw_ostream &OS, const CodeGenTarget &Target,
ArrayRef<const CodeGenInstruction *> TargetInstructions) {
- StringRef Namespace = Target.getInstNamespace();
-
// Map of operand names to their ID.
MapVector<StringRef, unsigned> OperandNameToID;
@@ -383,38 +381,35 @@ void InstrInfoEmitter::emitOperandNameMappings(
const size_t NumOperandNames = OperandNameToID.size();
const unsigned MaxNumOperands = MaxOperandNo + 1;
- OS << "#ifdef GET_INSTRINFO_OPERAND_ENUM\n";
- OS << "#undef GET_INSTRINFO_OPERAND_ENUM\n";
- OS << "namespace llvm::" << Namespace << " {\n";
-
- assert(NumOperandNames <= UINT16_MAX &&
- "Too many operands for the operand index -> name table");
- StringRef EnumType = getMinimalTypeForRange(NumOperandNames);
- OS << "enum class OpName : " << EnumType << " {\n";
- for (const auto &[Op, I] : OperandNameToID)
- OS << " " << Op << " = " << I << ",\n";
- OS << " NUM_OPERAND_NAMES = " << NumOperandNames << ",\n";
- OS << "}; // enum class OpName\n\n";
-
- OS << "LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, OpName "
- "Name);\n";
- OS << "LLVM_READONLY OpName getOperandIdxName(uint16_t Opcode, int16_t "
- "Idx);\n";
- OS << "} // end namespace llvm::" << Namespace << '\n';
- OS << "#endif //GET_INSTRINFO_OPERAND_ENUM\n\n";
-
- OS << "#ifdef GET_INSTRINFO_NAMED_OPS\n";
- OS << "#undef GET_INSTRINFO_NAMED_OPS\n";
- OS << "namespace llvm::" << Namespace << " {\n";
-
- emitGetInstructionIndexForOpLookup(OS, OperandMap, InstructionIndex);
+ const SmallString<32> Namespace({"llvm::", Target.getInstNamespace()});
+ {
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_OPERAND_ENUM");
+ NamespaceEmitter NS(OS, Namespace);
+
+ assert(NumOperandNames <= UINT16_MAX &&
+ "Too many operands for the operand index -> name table");
+ StringRef EnumType = getMinimalTypeForRange(NumOperandNames);
+ OS << "enum class OpName : " << EnumType << " {\n";
+ for (const auto &[Op, I] : OperandNameToID)
+ OS << " " << Op << " = " << I << ",\n";
+ OS << " NUM_OPERAND_NAMES = " << NumOperandNames << ",\n";
+ OS << "}; // enum class OpName\n\n";
+
+ OS << "LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, OpName "
+ "Name);\n";
+ OS << "LLVM_READONLY OpName getOperandIdxName(uint16_t Opcode, int16_t "
+ "Idx);\n";
+ }
- emitGetNamedOperandIdx(OS, OperandMap, MaxOperandNo, NumOperandNames);
- emitGetOperandIdxName(OS, OperandNameToID, OperandMap, MaxNumOperands,
- NumOperandNames);
+ {
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_NAMED_OPS");
+ NamespaceEmitter NS(OS, Namespace);
+ emitGetInstructionIndexForOpLookup(OS, OperandMap, InstructionIndex);
- OS << "} // end namespace llvm::" << Namespace << '\n';
- OS << "#endif //GET_INSTRINFO_NAMED_OPS\n\n";
+ emitGetNamedOperandIdx(OS, OperandMap, MaxOperandNo, NumOperandNames);
+ emitGetOperandIdxName(OS, OperandNameToID, OperandMap, MaxNumOperands,
+ NumOperandNames);
+ }
}
/// Generate an enum for all the operand types for this target, under the
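The refactor swaps the hand-written #ifdef/#undef ... #endif and namespace open/close pairs for scoped IfDefEmitter/NamespaceEmitter objects; the extra braces delimit exactly what lands inside each guard, and the destructors emit the closing lines. A minimal sketch of the RAII idea (the real emitters' signatures are inferred from their use above, not authoritative):

#include <ostream>
#include <string>
#include <utility>

class ScopedIfDef {
  std::ostream &OS;
  std::string Macro;

public:
  ScopedIfDef(std::ostream &Out, std::string M)
      : OS(Out), Macro(std::move(M)) {
    OS << "#ifdef " << Macro << "\n#undef " << Macro << "\n";
  }
  // Runs at the end of the enclosing block, so the guard can never be left
  // unbalanced by an early return or a reordered emission.
  ~ScopedIfDef() { OS << "#endif // " << Macro << "\n\n"; }
};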
@@ -439,121 +434,121 @@ void InstrInfoEmitter::emitOperandTypeMappings(
ArrayRef<const Record *> RegisterClasses =
Records.getAllDerivedDefinitions("RegisterClass");
- OS << "#ifdef GET_INSTRINFO_OPERAND_TYPES_ENUM\n";
- OS << "#undef GET_INSTRINFO_OPERAND_TYPES_ENUM\n";
- OS << "namespace llvm::" << Namespace << "::OpTypes {\n";
- OS << "enum OperandType {\n";
-
unsigned EnumVal = 0;
- for (ArrayRef<const Record *> RecordsToAdd :
- {Operands, RegisterOperands, RegisterClasses}) {
- for (const Record *Op : RecordsToAdd) {
- if (!Op->isAnonymous())
- OS << " " << Op->getName() << " = " << EnumVal << ",\n";
- ++EnumVal;
+
+ {
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_OPERAND_TYPES_ENUM");
+ NamespaceEmitter NS(OS, ("llvm::" + Namespace + "::OpTypes").str());
+ OS << "enum OperandType {\n";
+
+ for (ArrayRef<const Record *> RecordsToAdd :
+ {Operands, RegisterOperands, RegisterClasses}) {
+ for (const Record *Op : RecordsToAdd) {
+ if (!Op->isAnonymous())
+ OS << " " << Op->getName() << " = " << EnumVal << ",\n";
+ ++EnumVal;
+ }
}
+
+ OS << " OPERAND_TYPE_LIST_END" << "\n};\n";
}
- OS << " OPERAND_TYPE_LIST_END"
- << "\n};\n";
- OS << "} // end namespace llvm::" << Namespace << "::OpTypes\n";
- OS << "#endif // GET_INSTRINFO_OPERAND_TYPES_ENUM\n\n";
-
- OS << "#ifdef GET_INSTRINFO_OPERAND_TYPE\n";
- OS << "#undef GET_INSTRINFO_OPERAND_TYPE\n";
- OS << "namespace llvm::" << Namespace << " {\n";
- OS << "LLVM_READONLY\n";
- OS << "static int getOperandType(uint16_t Opcode, uint16_t OpIdx) {\n";
- auto getInstrName = [&](int I) -> StringRef {
- return NumberedInstructions[I]->getName();
- };
- // TODO: Factor out duplicate operand lists to compress the tables.
- std::vector<size_t> OperandOffsets;
- std::vector<const Record *> OperandRecords;
- size_t CurrentOffset = 0;
- for (const CodeGenInstruction *Inst : NumberedInstructions) {
- OperandOffsets.push_back(CurrentOffset);
- for (const auto &Op : Inst->Operands) {
- const DagInit *MIOI = Op.MIOperandInfo;
- if (!ExpandMIOperandInfo || !MIOI || MIOI->getNumArgs() == 0) {
- // Single, anonymous, operand.
- OperandRecords.push_back(Op.Rec);
- ++CurrentOffset;
- } else {
- for (const Init *Arg : MIOI->getArgs()) {
- OperandRecords.push_back(cast<DefInit>(Arg)->getDef());
+ {
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_OPERAND_TYPE");
+ NamespaceEmitter NS(OS, ("llvm::" + Namespace).str());
+ OS << "LLVM_READONLY\n";
+ OS << "static int getOperandType(uint16_t Opcode, uint16_t OpIdx) {\n";
+ auto getInstrName = [&](int I) -> StringRef {
+ return NumberedInstructions[I]->getName();
+ };
+ // TODO: Factor out duplicate operand lists to compress the tables.
+ std::vector<size_t> OperandOffsets;
+ std::vector<const Record *> OperandRecords;
+ size_t CurrentOffset = 0;
+ for (const CodeGenInstruction *Inst : NumberedInstructions) {
+ OperandOffsets.push_back(CurrentOffset);
+ for (const auto &Op : Inst->Operands) {
+ const DagInit *MIOI = Op.MIOperandInfo;
+ if (!ExpandMIOperandInfo || !MIOI || MIOI->getNumArgs() == 0) {
+ // Single, anonymous, operand.
+ OperandRecords.push_back(Op.Rec);
++CurrentOffset;
+ } else {
+ for (const Init *Arg : MIOI->getArgs()) {
+ OperandRecords.push_back(cast<DefInit>(Arg)->getDef());
+ ++CurrentOffset;
+ }
}
}
}
- }
- // Emit the table of offsets (indexes) into the operand type table.
- // Size the unsigned integer offset to save space.
- assert(OperandRecords.size() <= UINT32_MAX &&
- "Too many operands for offset table");
- OS << " static constexpr " << getMinimalTypeForRange(OperandRecords.size());
- OS << " Offsets[] = {\n";
- for (const auto &[Idx, Offset] : enumerate(OperandOffsets))
- OS << " " << Offset << ", // " << getInstrName(Idx) << '\n';
- OS << " };\n";
+ // Emit the table of offsets (indexes) into the operand type table.
+ // Size the unsigned integer offset to save space.
+ assert(OperandRecords.size() <= UINT32_MAX &&
+ "Too many operands for offset table");
+ OS << " static constexpr "
+ << getMinimalTypeForRange(OperandRecords.size());
+ OS << " Offsets[] = {\n";
+ for (const auto &[Idx, Offset] : enumerate(OperandOffsets))
+ OS << " " << Offset << ", // " << getInstrName(Idx) << '\n';
+ OS << " };\n";
- // Add an entry for the end so that we don't need to special case it below.
- OperandOffsets.push_back(OperandRecords.size());
-
- // Emit the actual operand types in a flat table.
- // Size the signed integer operand type to save space.
- assert(EnumVal <= INT16_MAX &&
- "Too many operand types for operand types table");
- OS << "\n using namespace OpTypes;\n";
- OS << " static";
- OS << (EnumVal <= INT8_MAX ? " constexpr int8_t" : " constexpr int16_t");
- OS << " OpcodeOperandTypes[] = {";
- size_t CurOffset = 0;
- for (auto [Idx, OpR] : enumerate(OperandRecords)) {
- // We print each Opcode's operands in its own row.
- if (Idx == OperandOffsets[CurOffset]) {
- OS << "\n /* " << getInstrName(CurOffset) << " */\n ";
- while (OperandOffsets[++CurOffset] == Idx)
- OS << "/* " << getInstrName(CurOffset) << " */\n ";
+ // Add an entry for the end so that we don't need to special case it below.
+ OperandOffsets.push_back(OperandRecords.size());
+
+ // Emit the actual operand types in a flat table.
+ // Size the signed integer operand type to save space.
+ assert(EnumVal <= INT16_MAX &&
+ "Too many operand types for operand types table");
+ OS << "\n using namespace OpTypes;\n";
+ OS << " static";
+ OS << (EnumVal <= INT8_MAX ? " constexpr int8_t" : " constexpr int16_t");
+ OS << " OpcodeOperandTypes[] = {";
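+    // Rows of the emitted table look like (X86-flavored, illustrative):
+    //   /* ADD8rr */
+    //   GR8, GR8, GR8,
+    //   /* MOV8ri */
+    //   GR8, i8imm,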
+ size_t CurOffset = 0;
+ for (auto [Idx, OpR] : enumerate(OperandRecords)) {
+ // We print each Opcode's operands in its own row.
+ if (Idx == OperandOffsets[CurOffset]) {
+ OS << "\n /* " << getInstrName(CurOffset) << " */\n ";
+ while (OperandOffsets[++CurOffset] == Idx)
+ OS << "/* " << getInstrName(CurOffset) << " */\n ";
+ }
+ if ((OpR->isSubClassOf("Operand") ||
+ OpR->isSubClassOf("RegisterOperand") ||
+ OpR->isSubClassOf("RegisterClass")) &&
+ !OpR->isAnonymous())
+ OS << OpR->getName();
+ else
+ OS << -1;
+ OS << ", ";
}
- if ((OpR->isSubClassOf("Operand") || OpR->isSubClassOf("RegisterOperand") ||
- OpR->isSubClassOf("RegisterClass")) &&
- !OpR->isAnonymous())
- OS << OpR->getName();
- else
- OS << -1;
- OS << ", ";
- }
- OS << "\n };\n";
+ OS << "\n };\n";
- OS << " return OpcodeOperandTypes[Offsets[Opcode] + OpIdx];\n";
- OS << "}\n";
- OS << "} // end namespace llvm::" << Namespace << '\n';
- OS << "#endif // GET_INSTRINFO_OPERAND_TYPE\n\n";
-
- OS << "#ifdef GET_INSTRINFO_MEM_OPERAND_SIZE\n";
- OS << "#undef GET_INSTRINFO_MEM_OPERAND_SIZE\n";
- OS << "namespace llvm::" << Namespace << " {\n";
- OS << "LLVM_READONLY\n";
- OS << "static int getMemOperandSize(int OpType) {\n";
- OS << " switch (OpType) {\n";
- std::map<int, SmallVector<StringRef, 0>> SizeToOperandName;
- for (const Record *Op : Operands) {
- if (!Op->isSubClassOf("X86MemOperand"))
- continue;
- if (int Size = Op->getValueAsInt("Size"))
- SizeToOperandName[Size].push_back(Op->getName());
+ OS << " return OpcodeOperandTypes[Offsets[Opcode] + OpIdx];\n";
+ OS << "}\n";
}
- OS << " default: return 0;\n";
- for (const auto &[Size, OperandNames] : SizeToOperandName) {
- for (const StringRef &OperandName : OperandNames)
- OS << " case OpTypes::" << OperandName << ":\n";
- OS << " return " << Size << ";\n\n";
+
+ {
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_MEM_OPERAND_SIZE");
+ NamespaceEmitter NS(OS, ("llvm::" + Namespace).str());
+
+ OS << "LLVM_READONLY\n";
+ OS << "static int getMemOperandSize(int OpType) {\n";
+ OS << " switch (OpType) {\n";
+ std::map<int, SmallVector<StringRef, 0>> SizeToOperandName;
+ for (const Record *Op : Operands) {
+ if (!Op->isSubClassOf("X86MemOperand"))
+ continue;
+ if (int Size = Op->getValueAsInt("Size"))
+ SizeToOperandName[Size].push_back(Op->getName());
+ }
+ OS << " default: return 0;\n";
+ for (const auto &[Size, OperandNames] : SizeToOperandName) {
+ for (const StringRef &OperandName : OperandNames)
+ OS << " case OpTypes::" << OperandName << ":\n";
+ OS << " return " << Size << ";\n\n";
+ }
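+    // All operand types that share a size collapse into one return, e.g.:
+    //   case OpTypes::<MemOperandName>:
+    //     return <Size>;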
+ OS << " }\n}\n";
}
- OS << " }\n}\n";
- OS << "} // end namespace llvm::" << Namespace << '\n';
- OS << "#endif // GET_INSTRINFO_MEM_OPERAND_SIZE\n\n";
}
// Fixed/Predefined instructions do not have UseLogicalOperandMappings
@@ -587,9 +582,8 @@ void InstrInfoEmitter::emitLogicalOperandSizeMappings(
InstMap[I->second].push_back((Namespace + "::" + Inst->getName()).str());
}
- OS << "#ifdef GET_INSTRINFO_LOGICAL_OPERAND_SIZE_MAP\n";
- OS << "#undef GET_INSTRINFO_LOGICAL_OPERAND_SIZE_MAP\n";
- OS << "namespace llvm::" << Namespace << " {\n";
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_LOGICAL_OPERAND_SIZE_MAP");
+ NamespaceEmitter NS(OS, ("llvm::" + Namespace).str());
OS << "LLVM_READONLY static unsigned\n";
OS << "getLogicalOperandSize(uint16_t Opcode, uint16_t LogicalOpIdx) {\n";
if (!InstMap.empty()) {
@@ -637,9 +631,6 @@ void InstrInfoEmitter::emitLogicalOperandSizeMappings(
OS << " S += getLogicalOperandSize(Opcode, i);\n";
OS << " return S;\n";
OS << "}\n";
-
- OS << "} // end namespace llvm::" << Namespace << '\n';
- OS << "#endif // GET_INSTRINFO_LOGICAL_OPERAND_SIZE_MAP\n\n";
}
void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS,
@@ -647,48 +638,38 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS,
ArrayRef<const Record *> TIIPredicates =
Records.getAllDerivedDefinitions("TIIPredicate");
- OS << "#ifdef GET_INSTRINFO_MC_HELPER_DECLS\n";
- OS << "#undef GET_INSTRINFO_MC_HELPER_DECLS\n\n";
-
- OS << "namespace llvm {\n";
- OS << "class MCInst;\n";
- OS << "class FeatureBitset;\n\n";
+ {
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_MC_HELPER_DECLS");
+ NamespaceEmitter LlvmNS(OS, "llvm");
+ OS << "class MCInst;\n";
+ OS << "class FeatureBitset;\n\n";
- OS << "namespace " << TargetName << "_MC {\n\n";
+ NamespaceEmitter TargetNS(OS, (TargetName + "_MC").str());
+ for (const Record *Rec : TIIPredicates)
+ OS << "bool " << Rec->getValueAsString("FunctionName")
+ << "(const MCInst &MI);\n";
- for (const Record *Rec : TIIPredicates) {
- OS << "bool " << Rec->getValueAsString("FunctionName")
- << "(const MCInst &MI);\n";
+ OS << "void verifyInstructionPredicates(unsigned Opcode, const "
+ "FeatureBitset "
+ "&Features);\n";
}
- OS << "void verifyInstructionPredicates(unsigned Opcode, const FeatureBitset "
- "&Features);\n";
-
- OS << "\n} // end namespace " << TargetName << "_MC\n";
- OS << "} // end namespace llvm\n\n";
-
- OS << "#endif // GET_INSTRINFO_MC_HELPER_DECLS\n\n";
-
- OS << "#ifdef GET_INSTRINFO_MC_HELPERS\n";
- OS << "#undef GET_INSTRINFO_MC_HELPERS\n\n";
-
- OS << "namespace llvm::" << TargetName << "_MC {\n";
+ {
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_MC_HELPERS");
+ NamespaceEmitter NS(OS, ("llvm::" + TargetName + "_MC").str());
- PredicateExpander PE(TargetName);
- PE.setExpandForMC(true);
+ PredicateExpander PE(TargetName);
+ PE.setExpandForMC(true);
- for (const Record *Rec : TIIPredicates) {
- OS << "bool " << Rec->getValueAsString("FunctionName");
- OS << "(const MCInst &MI) {\n";
+ for (const Record *Rec : TIIPredicates) {
+ OS << "bool " << Rec->getValueAsString("FunctionName");
+ OS << "(const MCInst &MI) {\n";
- OS << PE.getIndent();
- PE.expandStatement(OS, Rec->getValueAsDef("Body"));
- OS << "\n}\n\n";
+ OS << PE.getIndent();
+ PE.expandStatement(OS, Rec->getValueAsDef("Body"));
+ OS << "\n}\n\n";
+ }
}
-
- OS << "} // end namespace llvm::" << TargetName << "_MC\n";
-
- OS << "#endif // GET_GENISTRINFO_MC_HELPERS\n\n";
}
static std::string
@@ -710,148 +691,143 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS,
<< " defined(GET_AVAILABLE_OPCODE_CHECKER)\n"
<< "#define GET_COMPUTE_FEATURES\n"
<< "#endif\n";
- OS << "#ifdef GET_COMPUTE_FEATURES\n"
- << "#undef GET_COMPUTE_FEATURES\n"
- << "namespace llvm::" << Target.getName() << "_MC {\n";
-
- // Emit the subtarget feature enumeration.
- SubtargetFeatureInfo::emitSubtargetFeatureBitEnumeration(SubtargetFeatures,
- OS);
- // Emit the available features compute function.
- OS << "inline ";
- SubtargetFeatureInfo::emitComputeAssemblerAvailableFeatures(
- Target.getName(), "", "computeAvailableFeatures", SubtargetFeatures, OS);
-
- std::vector<std::vector<const Record *>> FeatureBitsets;
- for (const CodeGenInstruction *Inst : Target.getInstructions()) {
- FeatureBitsets.emplace_back();
- for (const Record *Predicate :
- Inst->TheDef->getValueAsListOfDefs("Predicates")) {
- const auto &I = SubtargetFeatures.find(Predicate);
- if (I != SubtargetFeatures.end())
- FeatureBitsets.back().push_back(I->second.TheDef);
+ std::string Namespace = ("llvm::" + Target.getName() + "_MC").str();
+ {
+ IfDefEmitter IfDef(OS, "GET_COMPUTE_FEATURES");
+ NamespaceEmitter NS(OS, Namespace);
+
+ // Emit the subtarget feature enumeration.
+ SubtargetFeatureInfo::emitSubtargetFeatureBitEnumeration(SubtargetFeatures,
+ OS);
+ // Emit the available features compute function.
+ OS << "inline ";
+ SubtargetFeatureInfo::emitComputeAssemblerAvailableFeatures(
+ Target.getName(), "", "computeAvailableFeatures", SubtargetFeatures,
+ OS);
+
+ std::vector<std::vector<const Record *>> FeatureBitsets;
+ for (const CodeGenInstruction *Inst : Target.getInstructions()) {
+ FeatureBitsets.emplace_back();
+ for (const Record *Predicate :
+ Inst->TheDef->getValueAsListOfDefs("Predicates")) {
+ const auto &I = SubtargetFeatures.find(Predicate);
+ if (I != SubtargetFeatures.end())
+ FeatureBitsets.back().push_back(I->second.TheDef);
+ }
}
- }
- llvm::sort(FeatureBitsets, [&](ArrayRef<const Record *> A,
- ArrayRef<const Record *> B) {
- if (A.size() < B.size())
- return true;
- if (A.size() > B.size())
- return false;
- for (auto Pair : zip(A, B)) {
- if (std::get<0>(Pair)->getName() < std::get<1>(Pair)->getName())
+ llvm::sort(FeatureBitsets, [&](ArrayRef<const Record *> A,
+ ArrayRef<const Record *> B) {
+ if (A.size() < B.size())
return true;
- if (std::get<0>(Pair)->getName() > std::get<1>(Pair)->getName())
+ if (A.size() > B.size())
return false;
+ for (auto Pair : zip(A, B)) {
+ if (std::get<0>(Pair)->getName() < std::get<1>(Pair)->getName())
+ return true;
+ if (std::get<0>(Pair)->getName() > std::get<1>(Pair)->getName())
+ return false;
+ }
+ return false;
+ });
+ FeatureBitsets.erase(llvm::unique(FeatureBitsets), FeatureBitsets.end());
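+    // The sort above orders bitsets by size, then by feature names, so
+    // llvm::unique can drop duplicates; instructions with identical
+    // predicate lists thus share a single table entry.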
+ OS << "inline FeatureBitset computeRequiredFeatures(unsigned Opcode) {\n"
+ << " enum : " << getMinimalTypeForRange(FeatureBitsets.size()) << " {\n"
+ << " CEFBS_None,\n";
+ for (const auto &FeatureBitset : FeatureBitsets) {
+ if (FeatureBitset.empty())
+ continue;
+ OS << " " << getNameForFeatureBitset(FeatureBitset) << ",\n";
}
- return false;
- });
- FeatureBitsets.erase(llvm::unique(FeatureBitsets), FeatureBitsets.end());
- OS << "inline FeatureBitset computeRequiredFeatures(unsigned Opcode) {\n"
- << " enum : " << getMinimalTypeForRange(FeatureBitsets.size()) << " {\n"
- << " CEFBS_None,\n";
- for (const auto &FeatureBitset : FeatureBitsets) {
- if (FeatureBitset.empty())
- continue;
- OS << " " << getNameForFeatureBitset(FeatureBitset) << ",\n";
- }
- OS << " };\n\n"
- << " static constexpr FeatureBitset FeatureBitsets[] = {\n"
- << " {}, // CEFBS_None\n";
- for (const auto &FeatureBitset : FeatureBitsets) {
- if (FeatureBitset.empty())
- continue;
- OS << " {";
- for (const auto &Feature : FeatureBitset) {
- const auto &I = SubtargetFeatures.find(Feature);
- assert(I != SubtargetFeatures.end() && "Didn't import predicate?");
- OS << I->second.getEnumBitName() << ", ";
+ OS << " };\n\n"
+ << " static constexpr FeatureBitset FeatureBitsets[] = {\n"
+ << " {}, // CEFBS_None\n";
+ for (const auto &FeatureBitset : FeatureBitsets) {
+ if (FeatureBitset.empty())
+ continue;
+ OS << " {";
+ for (const auto &Feature : FeatureBitset) {
+ const auto &I = SubtargetFeatures.find(Feature);
+ assert(I != SubtargetFeatures.end() && "Didn't import predicate?");
+ OS << I->second.getEnumBitName() << ", ";
+ }
+ OS << "},\n";
}
- OS << "},\n";
- }
- OS << " };\n"
- << " static constexpr " << getMinimalTypeForRange(FeatureBitsets.size())
- << " RequiredFeaturesRefs[] = {\n";
- ArrayRef<const CodeGenInstruction *> NumberedInstructions =
- Target.getInstructions();
- for (const CodeGenInstruction *Inst : NumberedInstructions) {
- OS << " CEFBS";
- unsigned NumPredicates = 0;
- for (const Record *Predicate :
- Inst->TheDef->getValueAsListOfDefs("Predicates")) {
- const auto &I = SubtargetFeatures.find(Predicate);
- if (I != SubtargetFeatures.end()) {
- OS << '_' << I->second.TheDef->getName();
- NumPredicates++;
+ OS << " };\n"
+ << " static constexpr " << getMinimalTypeForRange(FeatureBitsets.size())
+ << " RequiredFeaturesRefs[] = {\n";
+ ArrayRef<const CodeGenInstruction *> NumberedInstructions =
+ Target.getInstructions();
+ for (const CodeGenInstruction *Inst : NumberedInstructions) {
+ OS << " CEFBS";
+ unsigned NumPredicates = 0;
+ for (const Record *Predicate :
+ Inst->TheDef->getValueAsListOfDefs("Predicates")) {
+ const auto &I = SubtargetFeatures.find(Predicate);
+ if (I != SubtargetFeatures.end()) {
+ OS << '_' << I->second.TheDef->getName();
+ NumPredicates++;
+ }
}
+ if (!NumPredicates)
+ OS << "_None";
+ OS << ", // " << Inst->getName() << '\n';
}
- if (!NumPredicates)
- OS << "_None";
- OS << ", // " << Inst->getName() << '\n';
+ OS << " };\n\n"
+ << " assert(Opcode < " << NumberedInstructions.size() << ");\n"
+ << " return FeatureBitsets[RequiredFeaturesRefs[Opcode]];\n"
+ << "}\n\n";
+  } // end GET_COMPUTE_FEATURES scope.
+
+ {
+ IfDefEmitter IfDef(OS, "GET_AVAILABLE_OPCODE_CHECKER");
+ NamespaceEmitter NS(OS, Namespace);
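+    // The emitted checker computes (Available & Required) ^ Required, i.e.
+    // the set of required features missing from Available.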
+ OS << "bool isOpcodeAvailable("
+ << "unsigned Opcode, const FeatureBitset &Features) {\n"
+ << " FeatureBitset AvailableFeatures = "
+ << "computeAvailableFeatures(Features);\n"
+ << " FeatureBitset RequiredFeatures = "
+ << "computeRequiredFeatures(Opcode);\n"
+ << " FeatureBitset MissingFeatures =\n"
+ << " (AvailableFeatures & RequiredFeatures) ^\n"
+ << " RequiredFeatures;\n"
+ << " return !MissingFeatures.any();\n"
+ << "}\n";
+ }
+
+ {
+ IfDefEmitter IfDef(OS, "ENABLE_INSTR_PREDICATE_VERIFIER");
+ OS << "#include <sstream>\n\n";
+ NamespaceEmitter NS(OS, Namespace);
+ // Emit the name table for error messages.
+ OS << "#ifndef NDEBUG\n";
+ SubtargetFeatureInfo::emitNameTable(SubtargetFeatures, OS);
+ OS << "#endif // NDEBUG\n\n";
+ // Emit the predicate verifier.
+ OS << "void verifyInstructionPredicates(\n"
+ << " unsigned Opcode, const FeatureBitset &Features) {\n"
+ << "#ifndef NDEBUG\n";
+ OS << " FeatureBitset AvailableFeatures = "
+ "computeAvailableFeatures(Features);\n";
+ OS << " FeatureBitset RequiredFeatures = "
+ << "computeRequiredFeatures(Opcode);\n";
+ OS << " FeatureBitset MissingFeatures =\n"
+ << " (AvailableFeatures & RequiredFeatures) ^\n"
+ << " RequiredFeatures;\n"
+ << " if (MissingFeatures.any()) {\n"
+ << " std::ostringstream Msg;\n"
+ << " Msg << \"Attempting to emit \" << &" << Target.getName()
+ << "InstrNameData[" << Target.getName() << "InstrNameIndices[Opcode]]\n"
+ << " << \" instruction but the \";\n"
+ << " for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i)\n"
+ << " if (MissingFeatures.test(i))\n"
+ << " Msg << SubtargetFeatureNames[i] << \" \";\n"
+ << " Msg << \"predicate(s) are not met\";\n"
+ << " report_fatal_error(Msg.str().c_str());\n"
+ << " }\n"
+ << "#endif // NDEBUG\n";
+ OS << "}\n";
}
- OS << " };\n\n"
- << " assert(Opcode < " << NumberedInstructions.size() << ");\n"
- << " return FeatureBitsets[RequiredFeaturesRefs[Opcode]];\n"
- << "}\n\n";
-
- OS << "} // end namespace llvm::" << Target.getName() << "_MC\n"
- << "#endif // GET_COMPUTE_FEATURES\n\n";
-
- OS << "#ifdef GET_AVAILABLE_OPCODE_CHECKER\n"
- << "#undef GET_AVAILABLE_OPCODE_CHECKER\n"
- << "namespace llvm::" << Target.getName() << "_MC {\n";
- OS << "bool isOpcodeAvailable("
- << "unsigned Opcode, const FeatureBitset &Features) {\n"
- << " FeatureBitset AvailableFeatures = "
- << "computeAvailableFeatures(Features);\n"
- << " FeatureBitset RequiredFeatures = "
- << "computeRequiredFeatures(Opcode);\n"
- << " FeatureBitset MissingFeatures =\n"
- << " (AvailableFeatures & RequiredFeatures) ^\n"
- << " RequiredFeatures;\n"
- << " return !MissingFeatures.any();\n"
- << "}\n";
- OS << "} // end namespace llvm::" << Target.getName() << "_MC\n"
- << "#endif // GET_AVAILABLE_OPCODE_CHECKER\n\n";
-
- OS << "#ifdef ENABLE_INSTR_PREDICATE_VERIFIER\n"
- << "#undef ENABLE_INSTR_PREDICATE_VERIFIER\n"
- << "#include <sstream>\n\n";
-
- OS << "namespace llvm::" << Target.getName() << "_MC {\n";
-
- // Emit the name table for error messages.
- OS << "#ifndef NDEBUG\n";
- SubtargetFeatureInfo::emitNameTable(SubtargetFeatures, OS);
- OS << "#endif // NDEBUG\n\n";
-
- // Emit the predicate verifier.
- OS << "void verifyInstructionPredicates(\n"
- << " unsigned Opcode, const FeatureBitset &Features) {\n"
- << "#ifndef NDEBUG\n";
- OS << " FeatureBitset AvailableFeatures = "
- "computeAvailableFeatures(Features);\n";
- OS << " FeatureBitset RequiredFeatures = "
- << "computeRequiredFeatures(Opcode);\n";
- OS << " FeatureBitset MissingFeatures =\n"
- << " (AvailableFeatures & RequiredFeatures) ^\n"
- << " RequiredFeatures;\n"
- << " if (MissingFeatures.any()) {\n"
- << " std::ostringstream Msg;\n"
- << " Msg << \"Attempting to emit \" << &" << Target.getName()
- << "InstrNameData[" << Target.getName() << "InstrNameIndices[Opcode]]\n"
- << " << \" instruction but the \";\n"
- << " for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i)\n"
- << " if (MissingFeatures.test(i))\n"
- << " Msg << SubtargetFeatureNames[i] << \" \";\n"
- << " Msg << \"predicate(s) are not met\";\n"
- << " report_fatal_error(Msg.str().c_str());\n"
- << " }\n"
- << "#endif // NDEBUG\n";
- OS << "}\n";
- OS << "} // end namespace llvm::" << Target.getName() << "_MC\n";
- OS << "#endif // ENABLE_INSTR_PREDICATE_VERIFIER\n\n";
}
void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS,
@@ -954,270 +930,263 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
OS << "#endif // defined(GET_INSTRINFO_MC_DESC) || "
"defined(GET_INSTRINFO_CTOR_DTOR)\n\n";
- OS << "#ifdef GET_INSTRINFO_MC_DESC\n";
- OS << "#undef GET_INSTRINFO_MC_DESC\n";
- OS << "namespace llvm {\n\n";
-
- // Emit all of the MCInstrDesc records in reverse ENUM ordering.
- Timer.startTimer("Emit InstrDesc records");
- OS << "static_assert(sizeof(MCOperandInfo) % sizeof(MCPhysReg) == 0);\n";
- OS << "static constexpr unsigned " << TargetName << "ImpOpBase = sizeof "
- << TargetName << "InstrTable::OperandInfo / (sizeof(MCPhysReg));\n\n";
-
- OS << "extern const " << TargetName << "InstrTable " << TargetName
- << "Descs = {\n {\n";
- SequenceToOffsetTable<StringRef> InstrNames;
- unsigned Num = NumberedInstructions.size();
- for (const CodeGenInstruction *Inst : reverse(NumberedInstructions)) {
- // Keep a list of the instruction names.
- InstrNames.add(Inst->getName());
- // Emit the record into the table.
- emitRecord(*Inst, --Num, InstrInfo, EmittedLists, OperandInfoMap, OS);
- }
-
- OS << " }, {\n";
-
- // Emit all of the operand info records.
- Timer.startTimer("Emit operand info");
- EmitOperandInfo(OS, OperandInfoList);
-
- OS << " }, {\n";
-
- // Emit all of the instruction's implicit uses and defs.
- Timer.startTimer("Emit uses/defs");
- for (auto &List : ImplicitLists) {
- OS << " /* " << EmittedLists[List] << " */";
- for (auto &Reg : List)
- OS << ' ' << getQualifiedName(Reg) << ',';
- OS << '\n';
- }
-
- OS << " }\n};\n\n";
-
- // Emit the array of instruction names.
- Timer.startTimer("Emit instruction names");
- InstrNames.layout();
- InstrNames.emitStringLiteralDef(OS, Twine("extern const char ") + TargetName +
- "InstrNameData[]");
const CodeGenRegBank &RegBank = Target.getRegBank();
const CodeGenHwModes &CGH = Target.getHwModes();
unsigned NumModes = CGH.getNumModeIds();
ArrayRef<const Record *> RegClassByHwMode = Target.getAllRegClassByHwMode();
unsigned NumClassesByHwMode = RegClassByHwMode.size();
- OS << "extern const unsigned " << TargetName << "InstrNameIndices[] = {";
- Num = 0;
- for (const CodeGenInstruction *Inst : NumberedInstructions) {
- // Newline every eight entries.
- if (Num % 8 == 0)
- OS << "\n ";
- OS << InstrNames.get(Inst->getName()) << "U, ";
- ++Num;
- }
- OS << "\n};\n\n";
-
bool HasDeprecationFeatures =
llvm::any_of(NumberedInstructions, [](const CodeGenInstruction *Inst) {
return !Inst->HasComplexDeprecationPredicate &&
!Inst->DeprecatedReason.empty();
});
- if (HasDeprecationFeatures) {
- OS << "extern const uint8_t " << TargetName
- << "InstrDeprecationFeatures[] = {";
- Num = 0;
- for (const CodeGenInstruction *Inst : NumberedInstructions) {
- if (Num % 8 == 0)
- OS << "\n ";
- if (!Inst->HasComplexDeprecationPredicate &&
- !Inst->DeprecatedReason.empty())
- OS << Target.getInstNamespace() << "::" << Inst->DeprecatedReason
- << ", ";
- else
- OS << "uint8_t(-1), ";
- ++Num;
- }
- OS << "\n};\n\n";
- }
-
bool HasComplexDeprecationInfos =
llvm::any_of(NumberedInstructions, [](const CodeGenInstruction *Inst) {
return Inst->HasComplexDeprecationPredicate;
});
- if (HasComplexDeprecationInfos) {
- OS << "extern const MCInstrInfo::ComplexDeprecationPredicate " << TargetName
- << "InstrComplexDeprecationInfos[] = {";
+
+ {
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_MC_DESC");
+ NamespaceEmitter LlvmNS(OS, "llvm");
+
+ // Emit all of the MCInstrDesc records in reverse ENUM ordering.
+ Timer.startTimer("Emit InstrDesc records");
+ OS << "static_assert(sizeof(MCOperandInfo) % sizeof(MCPhysReg) == 0);\n";
+ OS << "static constexpr unsigned " << TargetName << "ImpOpBase = sizeof "
+ << TargetName << "InstrTable::OperandInfo / (sizeof(MCPhysReg));\n\n";
+
+ OS << "extern const " << TargetName << "InstrTable " << TargetName
+ << "Descs = {\n {\n";
+ SequenceToOffsetTable<StringRef> InstrNames;
+ unsigned Num = NumberedInstructions.size();
+ for (const CodeGenInstruction *Inst : reverse(NumberedInstructions)) {
+ // Keep a list of the instruction names.
+ InstrNames.add(Inst->getName());
+ // Emit the record into the table.
+ emitRecord(*Inst, --Num, InstrInfo, EmittedLists, OperandInfoMap, OS);
+ }
+
+ OS << " }, {\n";
+
+ // Emit all of the operand info records.
+ Timer.startTimer("Emit operand info");
+ EmitOperandInfo(OS, OperandInfoList);
+
+ OS << " }, {\n";
+
+ // Emit all of the instruction's implicit uses and defs.
+ Timer.startTimer("Emit uses/defs");
+ for (auto &List : ImplicitLists) {
+ OS << " /* " << EmittedLists[List] << " */";
+ for (auto &Reg : List)
+ OS << ' ' << getQualifiedName(Reg) << ',';
+ OS << '\n';
+ }
+
+ OS << " }\n};\n\n";
+
+ // Emit the array of instruction names.
+ Timer.startTimer("Emit instruction names");
+ InstrNames.layout();
+ InstrNames.emitStringLiteralDef(OS, Twine("extern const char ") +
+ TargetName + "InstrNameData[]");
+ OS << "extern const unsigned " << TargetName << "InstrNameIndices[] = {";
Num = 0;
for (const CodeGenInstruction *Inst : NumberedInstructions) {
+ // Newline every eight entries.
if (Num % 8 == 0)
OS << "\n ";
- if (Inst->HasComplexDeprecationPredicate)
- // Emit a function pointer to the complex predicate method.
- OS << "&get" << Inst->DeprecatedReason << "DeprecationInfo, ";
- else
- OS << "nullptr, ";
+ OS << InstrNames.get(Inst->getName()) << "U, ";
++Num;
}
OS << "\n};\n\n";
- }
-
- // MCInstrInfo initialization routine.
- Timer.startTimer("Emit initialization routine");
-
- if (NumClassesByHwMode != 0) {
- OS << "extern const int16_t " << TargetName << "RegClassByHwModeTables["
- << NumModes << "][" << NumClassesByHwMode << "] = {\n";
- for (unsigned M = 0; M < NumModes; ++M) {
- OS << " { // " << CGH.getModeName(M, /*IncludeDefault=*/true) << '\n';
- for (unsigned I = 0; I != NumClassesByHwMode; ++I) {
- const Record *Class = RegClassByHwMode[I];
- const HwModeSelect &ModeSelect = CGH.getHwModeSelect(Class);
-
- auto FoundMode =
- find_if(ModeSelect.Items, [=](const HwModeSelect::PairType P) {
- return P.first == M;
- });
-
- if (FoundMode == ModeSelect.Items.end()) {
- // If a RegClassByHwMode doesn't have an entry corresponding to a
- // mode, pad with default register class.
- OS << indent(4) << "-1, // Missing mode entry\n";
- } else {
- const CodeGenRegisterClass *RegClass =
- RegBank.getRegClass(FoundMode->second);
- OS << indent(4) << RegClass->getQualifiedIdName() << ",\n";
- }
+ if (HasDeprecationFeatures) {
+ OS << "extern const uint8_t " << TargetName
+ << "InstrDeprecationFeatures[] = {";
+ Num = 0;
+ for (const CodeGenInstruction *Inst : NumberedInstructions) {
+ if (Num % 8 == 0)
+ OS << "\n ";
+ if (!Inst->HasComplexDeprecationPredicate &&
+ !Inst->DeprecatedReason.empty())
+ OS << Target.getInstNamespace() << "::" << Inst->DeprecatedReason
+ << ", ";
+ else
+ OS << "uint8_t(-1), ";
+ ++Num;
}
-
- OS << " },\n";
+ OS << "\n};\n\n";
}
- OS << "};\n\n";
- }
+ if (HasComplexDeprecationInfos) {
+ OS << "extern const MCInstrInfo::ComplexDeprecationPredicate "
+ << TargetName << "InstrComplexDeprecationInfos[] = {";
+ Num = 0;
+ for (const CodeGenInstruction *Inst : NumberedInstructions) {
+ if (Num % 8 == 0)
+ OS << "\n ";
+ if (Inst->HasComplexDeprecationPredicate)
+ // Emit a function pointer to the complex predicate method.
+ OS << "&get" << Inst->DeprecatedReason << "DeprecationInfo, ";
+ else
+ OS << "nullptr, ";
+ ++Num;
+ }
+ OS << "\n};\n\n";
+ }
- OS << "static inline void Init" << TargetName
- << "MCInstrInfo(MCInstrInfo *II) {\n";
- OS << " II->InitMCInstrInfo(" << TargetName << "Descs.Insts, " << TargetName
- << "InstrNameIndices, " << TargetName << "InstrNameData, ";
- if (HasDeprecationFeatures)
- OS << TargetName << "InstrDeprecationFeatures, ";
- else
- OS << "nullptr, ";
- if (HasComplexDeprecationInfos)
- OS << TargetName << "InstrComplexDeprecationInfos, ";
- else
- OS << "nullptr, ";
- OS << NumberedInstructions.size() << ", ";
-
- if (NumClassesByHwMode != 0) {
- OS << '&' << TargetName << "RegClassByHwModeTables[0][0], "
- << NumClassesByHwMode;
- } else
- OS << "nullptr, 0";
-
- OS << ");\n}\n\n";
+ // MCInstrInfo initialization routine.
+ Timer.startTimer("Emit initialization routine");
+
+ if (NumClassesByHwMode != 0) {
+ OS << "extern const int16_t " << TargetName << "RegClassByHwModeTables["
+ << NumModes << "][" << NumClassesByHwMode << "] = {\n";
+
+ for (unsigned M = 0; M < NumModes; ++M) {
+ OS << " { // " << CGH.getModeName(M, /*IncludeDefault=*/true) << '\n';
+ for (unsigned I = 0; I != NumClassesByHwMode; ++I) {
+ const Record *Class = RegClassByHwMode[I];
+ const HwModeSelect &ModeSelect = CGH.getHwModeSelect(Class);
+
+ auto FoundMode =
+ find_if(ModeSelect.Items, [=](const HwModeSelect::PairType P) {
+ return P.first == M;
+ });
+
+ if (FoundMode == ModeSelect.Items.end()) {
+ // If a RegClassByHwMode doesn't have an entry corresponding to a
+ // mode, pad with default register class.
+ OS << indent(4) << "-1, // Missing mode entry\n";
+ } else {
+ const CodeGenRegisterClass *RegClass =
+ RegBank.getRegClass(FoundMode->second);
+ OS << indent(4) << RegClass->getQualifiedIdName() << ",\n";
+ }
+ }
- OS << "} // end namespace llvm\n";
+ OS << " },\n";
+ }
- OS << "#endif // GET_INSTRINFO_MC_DESC\n\n";
+ OS << "};\n\n";
+ }
- // Create a TargetInstrInfo subclass to hide the MC layer initialization.
- OS << "#ifdef GET_INSTRINFO_HEADER\n";
- OS << "#undef GET_INSTRINFO_HEADER\n";
+ OS << "static inline void Init" << TargetName
+ << "MCInstrInfo(MCInstrInfo *II) {\n";
+ OS << " II->InitMCInstrInfo(" << TargetName << "Descs.Insts, "
+ << TargetName << "InstrNameIndices, " << TargetName << "InstrNameData, ";
+ if (HasDeprecationFeatures)
+ OS << TargetName << "InstrDeprecationFeatures, ";
+ else
+ OS << "nullptr, ";
+ if (HasComplexDeprecationInfos)
+ OS << TargetName << "InstrComplexDeprecationInfos, ";
+ else
+ OS << "nullptr, ";
+ OS << NumberedInstructions.size() << ", ";
- Twine ClassName = TargetName + "GenInstrInfo";
- OS << "namespace llvm {\n";
- OS << "struct " << ClassName << " : public TargetInstrInfo {\n"
- << " explicit " << ClassName
- << "(const TargetSubtargetInfo &STI, unsigned CFSetupOpcode = ~0u, "
- "unsigned CFDestroyOpcode = ~0u, "
- "unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u);\n"
- << " ~" << ClassName << "() override = default;\n";
+ if (NumClassesByHwMode != 0) {
+ OS << '&' << TargetName << "RegClassByHwModeTables[0][0], "
+ << NumClassesByHwMode;
+ } else
+ OS << "nullptr, 0";
- OS << "\n};\n} // end namespace llvm\n";
+ OS << ");\n}\n\n";
+ } // end GET_INSTRINFO_MC_DESC scope.
{
- NamespaceEmitter LlvmNS(OS, "llvm");
- NamespaceEmitter TargetNS(OS, Target.getInstNamespace());
+ // Create a TargetInstrInfo subclass to hide the MC layer initialization.
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_HEADER");
+ {
+ NamespaceEmitter LlvmNS(OS, "llvm");
+ Twine ClassName = TargetName + "GenInstrInfo";
+ OS << "struct " << ClassName << " : public TargetInstrInfo {\n"
+ << " explicit " << ClassName
+ << "(const TargetSubtargetInfo &STI, unsigned CFSetupOpcode = ~0u, "
+ "unsigned CFDestroyOpcode = ~0u, "
+ "unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u);\n"
+ << " ~" << ClassName << "() override = default;\n"
+ << "};\n";
+ } // end llvm namespace.
+
+ OS << "\n";
+ NamespaceEmitter InstNS(OS, ("llvm::" + Target.getInstNamespace()).str());
for (const Record *R : Records.getAllDerivedDefinitions("Operand")) {
if (R->isAnonymous())
continue;
- if (const DagInit *D = R->getValueAsDag("MIOperandInfo")) {
- for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i) {
- if (const StringInit *Name = D->getArgName(i))
- OS << "constexpr unsigned SUBOP_" << R->getName() << "_"
- << Name->getValue() << " = " << i << ";\n";
- }
+ const DagInit *D = R->getValueAsDag("MIOperandInfo");
+ if (!D)
+ continue;
+ for (const auto &[Idx, Name] : enumerate(D->getArgNames())) {
+ if (Name)
+ OS << "constexpr unsigned SUBOP_" << R->getName() << "_"
+ << Name->getValue() << " = " << Idx << ";\n";
}
}
- }
-
- OS << "#endif // GET_INSTRINFO_HEADER\n\n";
-
- OS << "#ifdef GET_INSTRINFO_HELPER_DECLS\n";
- OS << "#undef GET_INSTRINFO_HELPER_DECLS\n\n";
- emitTIIHelperMethods(OS, TargetName, /* ExpandDefinition = */ false);
- OS << '\n';
- OS << "#endif // GET_INSTRINFO_HELPER_DECLS\n\n";
-
- OS << "#ifdef GET_INSTRINFO_HELPERS\n";
- OS << "#undef GET_INSTRINFO_HELPERS\n\n";
- emitTIIHelperMethods(OS, TargetName, /* ExpandDefinition = */ true);
- OS << "#endif // GET_INSTRINFO_HELPERS\n\n";
+ } // end GET_INSTRINFO_HEADER scope.
- OS << "#ifdef GET_INSTRINFO_CTOR_DTOR\n";
- OS << "#undef GET_INSTRINFO_CTOR_DTOR\n";
-
- OS << "namespace llvm {\n";
- OS << "extern const " << TargetName << "InstrTable " << TargetName
- << "Descs;\n";
- OS << "extern const unsigned " << TargetName << "InstrNameIndices[];\n";
- OS << "extern const char " << TargetName << "InstrNameData[];\n";
-
- if (NumClassesByHwMode != 0) {
- OS << "extern const int16_t " << TargetName << "RegClassByHwModeTables["
- << NumModes << "][" << NumClassesByHwMode << "];\n";
+ {
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_HELPER_DECLS");
+ emitTIIHelperMethods(OS, TargetName, /* ExpandDefinition = */ false);
}
- if (HasDeprecationFeatures)
- OS << "extern const uint8_t " << TargetName
- << "InstrDeprecationFeatures[];\n";
- if (HasComplexDeprecationInfos)
- OS << "extern const MCInstrInfo::ComplexDeprecationPredicate " << TargetName
- << "InstrComplexDeprecationInfos[];\n";
- OS << ClassName << "::" << ClassName
- << "(const TargetSubtargetInfo &STI, unsigned CFSetupOpcode, unsigned "
- "CFDestroyOpcode, unsigned CatchRetOpcode, unsigned ReturnOpcode)\n"
- << " : TargetInstrInfo(CFSetupOpcode, CFDestroyOpcode, CatchRetOpcode, "
- "ReturnOpcode";
- if (NumClassesByHwMode != 0)
- OS << ", " << TargetName
- << "RegClassByHwModeTables[STI.getHwMode(MCSubtargetInfo::HwMode_"
- "RegInfo)]";
-
- OS << ") {\n"
- << " InitMCInstrInfo(" << TargetName << "Descs.Insts, " << TargetName
- << "InstrNameIndices, " << TargetName << "InstrNameData, ";
- if (HasDeprecationFeatures)
- OS << TargetName << "InstrDeprecationFeatures, ";
- else
- OS << "nullptr, ";
- if (HasComplexDeprecationInfos)
- OS << TargetName << "InstrComplexDeprecationInfos, ";
- else
- OS << "nullptr, ";
- OS << NumberedInstructions.size();
-
- if (NumClassesByHwMode != 0) {
- OS << ", &" << TargetName << "RegClassByHwModeTables[0][0], "
- << NumClassesByHwMode;
+ {
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_HELPERS");
+ emitTIIHelperMethods(OS, TargetName, /* ExpandDefinition = */ true);
}
- OS << ");\n"
- "}\n"
- "} // end namespace llvm\n";
+ {
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_CTOR_DTOR");
+ NamespaceEmitter LlvmNS(OS, "llvm");
+ OS << "extern const " << TargetName << "InstrTable " << TargetName
+ << "Descs;\n";
+ OS << "extern const unsigned " << TargetName << "InstrNameIndices[];\n";
+ OS << "extern const char " << TargetName << "InstrNameData[];\n";
+
+ if (NumClassesByHwMode != 0) {
+ OS << "extern const int16_t " << TargetName << "RegClassByHwModeTables["
+ << NumModes << "][" << NumClassesByHwMode << "];\n";
+ }
+
+ if (HasDeprecationFeatures)
+ OS << "extern const uint8_t " << TargetName
+ << "InstrDeprecationFeatures[];\n";
+ if (HasComplexDeprecationInfos)
+ OS << "extern const MCInstrInfo::ComplexDeprecationPredicate "
+ << TargetName << "InstrComplexDeprecationInfos[];\n";
+ Twine ClassName = TargetName + "GenInstrInfo";
+ OS << ClassName << "::" << ClassName
+ << "(const TargetSubtargetInfo &STI, unsigned CFSetupOpcode, unsigned "
+ "CFDestroyOpcode, unsigned CatchRetOpcode, unsigned ReturnOpcode)\n"
+ << " : TargetInstrInfo(CFSetupOpcode, CFDestroyOpcode, CatchRetOpcode, "
+ "ReturnOpcode";
+ if (NumClassesByHwMode != 0)
+ OS << ", " << TargetName
+ << "RegClassByHwModeTables[STI.getHwMode(MCSubtargetInfo::HwMode_"
+ "RegInfo)]";
+
+ OS << ") {\n"
+ << " InitMCInstrInfo(" << TargetName << "Descs.Insts, " << TargetName
+ << "InstrNameIndices, " << TargetName << "InstrNameData, ";
+ if (HasDeprecationFeatures)
+ OS << TargetName << "InstrDeprecationFeatures, ";
+ else
+ OS << "nullptr, ";
+ if (HasComplexDeprecationInfos)
+ OS << TargetName << "InstrComplexDeprecationInfos, ";
+ else
+ OS << "nullptr, ";
+ OS << NumberedInstructions.size();
- OS << "#endif // GET_INSTRINFO_CTOR_DTOR\n\n";
+ if (NumClassesByHwMode != 0) {
+ OS << ", &" << TargetName << "RegClassByHwModeTables[0][0], "
+ << NumClassesByHwMode;
+ }
+
+ OS << ");\n"
+ "}\n";
+ } // end GET_INSTRINFO_CTOR_DTOR scope.
ArrayRef<const CodeGenInstruction *> TargetInstructions =
Target.getTargetInstructions();
@@ -1384,8 +1353,6 @@ void InstrInfoEmitter::emitRecord(
void InstrInfoEmitter::emitEnums(
raw_ostream &OS,
ArrayRef<const CodeGenInstruction *> NumberedInstructions) {
- OS << "#ifdef GET_INSTRINFO_ENUM\n";
- OS << "#undef GET_INSTRINFO_ENUM\n";
const CodeGenTarget &Target = CDP.getTargetInfo();
StringRef Namespace = Target.getInstNamespace();
@@ -1393,48 +1360,48 @@ void InstrInfoEmitter::emitEnums(
if (Namespace.empty())
PrintFatalError("No instructions defined!");
- OS << "namespace llvm::" << Namespace << " {\n";
+ {
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_ENUM");
+ NamespaceEmitter NS(OS, ("llvm::" + Namespace).str());
- auto II = llvm::max_element(
- NumberedInstructions,
- [](const CodeGenInstruction *InstA, const CodeGenInstruction *InstB) {
- return InstA->getName().size() < InstB->getName().size();
- });
- size_t MaxNameSize = (*II)->getName().size();
+ auto II = llvm::max_element(
+ NumberedInstructions,
+ [](const CodeGenInstruction *InstA, const CodeGenInstruction *InstB) {
+ return InstA->getName().size() < InstB->getName().size();
+ });
+ size_t MaxNameSize = (*II)->getName().size();
- OS << " enum {\n";
- for (const CodeGenInstruction *Inst : NumberedInstructions) {
- OS << " " << left_justify(Inst->getName(), MaxNameSize) << " = "
- << Target.getInstrIntValue(Inst->TheDef) << ", // "
- << SrcMgr.getFormattedLocationNoOffset(Inst->TheDef->getLoc().front())
- << '\n';
+ OS << " enum {\n";
+ for (const CodeGenInstruction *Inst : NumberedInstructions) {
+ OS << " " << left_justify(Inst->getName(), MaxNameSize) << " = "
+ << Target.getInstrIntValue(Inst->TheDef) << ", // "
+ << SrcMgr.getFormattedLocationNoOffset(Inst->TheDef->getLoc().front())
+ << '\n';
+ }
+ OS << " INSTRUCTION_LIST_END = " << NumberedInstructions.size() << '\n';
+ OS << " };\n";
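+    // Emits entries such as (name, value, and location illustrative):
+    //   ADD8rr = 1234, // SomeTarget/SomeTargetInstrInfo.td:42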
+
+ ArrayRef<const Record *> RegClassesByHwMode =
+ Target.getAllRegClassByHwMode();
+ if (!RegClassesByHwMode.empty()) {
+ OS << " enum RegClassByHwModeUses : uint16_t {\n";
+ for (const Record *ClassByHwMode : RegClassesByHwMode)
+ OS << indent(4) << ClassByHwMode->getName() << ",\n";
+ OS << " };\n";
+ }
}
- OS << " INSTRUCTION_LIST_END = " << NumberedInstructions.size() << '\n';
- OS << " };\n";
- ArrayRef<const Record *> RegClassesByHwMode = Target.getAllRegClassByHwMode();
- if (!RegClassesByHwMode.empty()) {
- OS << " enum RegClassByHwModeUses : uint16_t {\n";
- for (const Record *ClassByHwMode : RegClassesByHwMode)
- OS << indent(4) << ClassByHwMode->getName() << ",\n";
+ {
+ IfDefEmitter IfDef(OS, "GET_INSTRINFO_SCHED_ENUM");
+ NamespaceEmitter NS(OS, ("llvm::" + Namespace + "::Sched").str());
+
+ OS << " enum {\n";
+    auto ExplicitClasses = SchedModels.explicitSchedClasses();
+    for (const auto &[Idx, Class] : enumerate(ExplicitClasses))
+      OS << "    " << Class.Name << "\t= " << Idx << ",\n";
+    OS << "    SCHED_LIST_END = " << ExplicitClasses.size() << '\n';
OS << " };\n";
}
-
- OS << "} // end namespace llvm::" << Namespace << '\n';
- OS << "#endif // GET_INSTRINFO_ENUM\n\n";
-
- OS << "#ifdef GET_INSTRINFO_SCHED_ENUM\n";
- OS << "#undef GET_INSTRINFO_SCHED_ENUM\n";
- OS << "namespace llvm::" << Namespace << "::Sched {\n";
- OS << " enum {\n";
- auto ExplictClasses = SchedModels.explicitSchedClasses();
- for (const auto &[Idx, Class] : enumerate(ExplictClasses))
- OS << " " << Class.Name << "\t= " << Idx << ",\n";
- OS << " SCHED_LIST_END = " << ExplictClasses.size() << '\n';
- OS << " };\n";
- OS << "} // end namespace llvm::" << Namespace << "::Sched\n";
-
- OS << "#endif // GET_INSTRINFO_SCHED_ENUM\n\n";
}
static TableGen::Emitter::OptClass<InstrInfoEmitter>
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 9e0b951..27bd2ce 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -336,6 +336,7 @@ if (current_toolchain == default_toolchain) {
"__chrono/gps_clock.h",
"__chrono/hh_mm_ss.h",
"__chrono/high_resolution_clock.h",
+ "__chrono/is_clock.h",
"__chrono/leap_second.h",
"__chrono/literals.h",
"__chrono/local_info.h",
diff --git a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
index 783eb96..679373d 100644
--- a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
@@ -72,6 +72,7 @@ static_library("Target") {
"Statistics.cpp",
"StopInfo.cpp",
"StructuredDataPlugin.cpp",
+ "SyntheticFrameProvider.cpp",
"SystemRuntime.cpp",
"Target.cpp",
"TargetList.cpp",
diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake
index fa6aec8..8196e2a 100644
--- a/mlir/cmake/modules/AddMLIRPython.cmake
+++ b/mlir/cmake/modules/AddMLIRPython.cmake
@@ -791,7 +791,6 @@ function(add_mlir_python_extension libname extname)
get_property(NB_LIBRARY_TARGET_NAME TARGET ${libname} PROPERTY LINK_LIBRARIES)
target_compile_options(${NB_LIBRARY_TARGET_NAME}
PRIVATE
- -Wall -Wextra -Wpedantic
-Wno-c++98-compat-extra-semi
-Wno-cast-qual
-Wno-covered-switch-default
@@ -799,11 +798,11 @@ function(add_mlir_python_extension libname extname)
-Wno-nested-anon-types
-Wno-unused-parameter
-Wno-zero-length-array
+ -Wno-missing-field-initializers
${eh_rtti_enable})
target_compile_options(${libname}
PRIVATE
- -Wall -Wextra -Wpedantic
-Wno-c++98-compat-extra-semi
-Wno-cast-qual
-Wno-covered-switch-default
@@ -811,6 +810,7 @@ function(add_mlir_python_extension libname extname)
-Wno-nested-anon-types
-Wno-unused-parameter
-Wno-zero-length-array
+ -Wno-missing-field-initializers
${eh_rtti_enable})
endif()
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td
index 246ae77..c7775f2 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td
@@ -138,7 +138,7 @@ def BufferizableOpInterface : OpInterface<"BufferizableOpInterface"> {
/*retType=*/"bool",
/*methodName=*/"bufferizesToElementwiseAccess",
/*args=*/(ins "const ::mlir::bufferization::AnalysisState &":$state,
- "ArrayRef<OpOperand *>":$opOperands),
+ "::llvm::ArrayRef<::mlir::OpOperand *>":$opOperands),
/*methodBody=*/"",
/*defaultImplementation=*/[{
// It is always safe to assume that the op is not element-wise.
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h b/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h
index 69447f7..b7abcde 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h
@@ -13,6 +13,7 @@
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Utils/ReshapeOpsUtils.h"
#include "mlir/IR/Dialect.h"
+#include "mlir/Interfaces/AlignmentAttrInterface.h"
#include "mlir/Interfaces/CallInterfaces.h"
#include "mlir/Interfaces/CastInterfaces.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index e00f3c1..8965302 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -11,6 +11,7 @@
include "mlir/Dialect/Arith/IR/ArithBase.td"
include "mlir/Dialect/MemRef/IR/MemRefBase.td"
+include "mlir/Interfaces/AlignmentAttrInterface.td"
include "mlir/Interfaces/CastInterfaces.td"
include "mlir/Interfaces/ControlFlowInterfaces.td"
include "mlir/Interfaces/InferIntRangeInterface.td"
@@ -65,15 +66,15 @@ class AllocLikeOp<string mnemonic,
list<Trait> traits = []> :
MemRef_Op<mnemonic,
!listconcat([
- AttrSizedOperandSegments
+ AttrSizedOperandSegments,
+ DeclareOpInterfaceMethods<AlignmentAttrOpInterface>
], traits)> {
let arguments = (ins Variadic<Index>:$dynamicSizes,
// The symbolic operands (the ones in square brackets)
// bind to the symbols of the memref's layout map.
Variadic<Index>:$symbolOperands,
- ConfinedAttr<OptionalAttr<I64Attr>,
- [IntMinValue<0>]>:$alignment);
+ OptionalAttr<IntValidAlignment<I64Attr>>:$alignment);
let results = (outs Res<AnyMemRef, "",
[MemAlloc<resource, 0, FullEffect>]>:$memref);
@@ -269,7 +270,8 @@ def MemRef_AllocOp : AllocLikeOp<"alloc", DefaultResource, [
//===----------------------------------------------------------------------===//
-def MemRef_ReallocOp : MemRef_Op<"realloc"> {
+def MemRef_ReallocOp : MemRef_Op<"realloc",
+ [DeclareOpInterfaceMethods<AlignmentAttrOpInterface>]> {
let summary = "memory reallocation operation";
let description = [{
The `realloc` operation changes the size of a memory region. The memory
@@ -335,8 +337,7 @@ def MemRef_ReallocOp : MemRef_Op<"realloc"> {
let arguments = (ins Arg<MemRefRankOf<[AnyType], [1]>, "",
[MemFreeAt<0, FullEffect>]>:$source,
Optional<Index>:$dynamicResultSize,
- ConfinedAttr<OptionalAttr<I64Attr>,
- [IntMinValue<0>]>:$alignment);
+ OptionalAttr<IntValidAlignment<I64Attr>>:$alignment);
let results = (outs Res<MemRefRankOf<[AnyType], [1]>, "",
[MemAlloc<DefaultResource, 1,
@@ -1160,7 +1161,8 @@ def MemRef_GetGlobalOp : MemRef_Op<"get_global",
// GlobalOp
//===----------------------------------------------------------------------===//
-def MemRef_GlobalOp : MemRef_Op<"global", [Symbol]> {
+def MemRef_GlobalOp : MemRef_Op<"global", [Symbol,
+ DeclareOpInterfaceMethods<AlignmentAttrOpInterface>]> {
let summary = "declare or define a global memref variable";
let description = [{
The `memref.global` operation declares or defines a named global memref
@@ -1235,6 +1237,7 @@ def LoadOp : MemRef_Op<"load",
"memref", "result",
"::llvm::cast<MemRefType>($_self).getElementType()">,
MemRefsNormalizable,
+ DeclareOpInterfaceMethods<AlignmentAttrOpInterface>,
DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>,
DeclareOpInterfaceMethods<PromotableMemOpInterface>,
DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>]> {
@@ -2010,6 +2013,7 @@ def MemRef_StoreOp : MemRef_Op<"store",
"memref", "value",
"::llvm::cast<MemRefType>($_self).getElementType()">,
MemRefsNormalizable,
+ DeclareOpInterfaceMethods<AlignmentAttrOpInterface>,
DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>,
DeclareOpInterfaceMethods<PromotableMemOpInterface>,
DeclareOpInterfaceMethods<DestructurableAccessorOpInterface>]> {
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index c689b7e..5b89f74 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2184,6 +2184,8 @@ def OpenACC_KernelEnvironmentOp : OpenACC_Op<"kernel_environment",
)
$region attr-dict
}];
+
+ let hasCanonicalizer = 1;
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.h b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.h
index 57d532b..27f65aa 100644
--- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.h
@@ -9,6 +9,9 @@
#ifndef MLIR_DIALECT_OPENACC_TRANSFORMS_PASSES_H
#define MLIR_DIALECT_OPENACC_TRANSFORMS_PASSES_H
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/Pass/Pass.h"
namespace mlir {
diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
index 9ceb91e5..40ccd1f 100644
--- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
@@ -27,4 +27,40 @@ def LegalizeDataValuesInRegion : Pass<"openacc-legalize-data-values", "mlir::fun
];
}
+def ACCImplicitData : Pass<"acc-implicit-data", "mlir::ModuleOp"> {
+ let summary = "Generate implicit data attributes for OpenACC compute constructs";
+ let description = [{
+ This pass implements the OpenACC specification for "Variables with
+ Implicitly Determined Data Attributes" (OpenACC 3.4 spec, section 2.6.2).
+
+ The pass automatically generates data clause operations for variables used
+ within OpenACC compute constructs (parallel, kernels, serial) that do not
+ already have explicit data clauses. The semantics follow these rules:
+
+ 1. If there is a default(none) clause visible, no implicit data actions
+ apply.
+
+    2. An aggregate variable (arrays, derived types, etc.) will be treated
+       as if it appears in:
+       - A present clause when default(present) is visible.
+       - A copy clause otherwise.
+
+ 3. A scalar variable will be treated as if it appears in:
+ - A copy clause if the compute construct is a kernels construct.
+ - A firstprivate clause otherwise (parallel, serial).
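+
+    As a rough sketch of rules 2 and 3 (op syntax abbreviated and variable
+    names illustrative), a scalar and an array used in an `acc.parallel`
+    region with no explicit clauses would conceptually become:
+
+    ```mlir
+    // %a = acc.copyin varPtr(%array : memref<10xf32>) -> memref<10xf32>
+    // acc.parallel firstprivate(@fp_scalar -> %scalar : memref<f32>)
+    //     dataOperands(%a : memref<10xf32>) {
+    //   ...
+    // }
+    // acc.copyout accVar(%a : memref<10xf32>)
+    //     to varPtr(%array : memref<10xf32>)
+    ```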
+ }];
+ let dependentDialects = ["mlir::acc::OpenACCDialect",
+ "mlir::memref::MemRefDialect",
+ "mlir::arith::ArithDialect"];
+ let options = [
+ Option<"enableImplicitReductionCopy", "enable-implicit-reduction-copy",
+ "bool", "true",
+ "Enable applying implicit copy in lieu of implicit firstprivate for "
+ "reduction variables. This allows uniform treatment of reduction "
+ "variables between combined constructs (e.g., 'parallel loop') and "
+ "separate constructs (e.g., 'parallel' followed by 'loop'), where "
+ "the OpenACC spec requires copy semantics for the former but "
+ "firstprivate would normally apply for the latter.">
+ ];
+}
+
#endif // MLIR_DIALECT_OPENACC_TRANSFORMS_PASSES
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCooperativeMatrixOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCooperativeMatrixOps.td
index 827ac90..e8124b8 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCooperativeMatrixOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCooperativeMatrixOps.td
@@ -16,6 +16,8 @@
#ifndef MLIR_DIALECT_SPIRV_IR_COOPERATIVE_MATRIX_OPS
#define MLIR_DIALECT_SPIRV_IR_COOPERATIVE_MATRIX_OPS
+include "mlir/Interfaces/AlignmentAttrInterface.td"
+
//===----------------------------------------------------------------------===//
// SPV_KHR_cooperative_matrix extension ops.
//===----------------------------------------------------------------------===//
@@ -62,7 +64,7 @@ def SPIRV_KHRCooperativeMatrixLengthOp :
// -----
-def SPIRV_KHRCooperativeMatrixLoadOp : SPIRV_KhrVendorOp<"CooperativeMatrixLoad", []> {
+def SPIRV_KHRCooperativeMatrixLoadOp
+    : SPIRV_KhrVendorOp<"CooperativeMatrixLoad",
+                        [DeclareOpInterfaceMethods<AlignmentAttrOpInterface>]> {
let summary = "Loads a cooperative matrix through a pointer";
let description = [{
@@ -148,7 +150,7 @@ def SPIRV_KHRCooperativeMatrixLoadOp : SPIRV_KhrVendorOp<"CooperativeMatrixLoad"
// -----
-def SPIRV_KHRCooperativeMatrixStoreOp : SPIRV_KhrVendorOp<"CooperativeMatrixStore", []> {
+def SPIRV_KHRCooperativeMatrixStoreOp
+    : SPIRV_KhrVendorOp<"CooperativeMatrixStore",
+                        [DeclareOpInterfaceMethods<AlignmentAttrOpInterface>]> {
let summary = "Stores a cooperative matrix through a pointer";
let description = [{
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td
index 6108dec..0b3d70f 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td
@@ -15,6 +15,8 @@
#define MLIR_DIALECT_SPIRV_IR_MEMORY_OPS
include "mlir/Dialect/SPIRV/IR/SPIRVBase.td"
+include "mlir/Interfaces/AlignmentAttrInterface.td"
+
// -----
@@ -79,7 +81,7 @@ def SPIRV_AccessChainOp : SPIRV_Op<"AccessChain", [Pure]> {
// -----
-def SPIRV_CopyMemoryOp : SPIRV_Op<"CopyMemory", []> {
+def SPIRV_CopyMemoryOp
+    : SPIRV_Op<"CopyMemory",
+               [DeclareOpInterfaceMethods<AlignmentAttrOpInterface>]> {
let summary = [{
Copy from the memory pointed to by Source to the memory pointed to by
Target. Both operands must be non-void pointers and having the same <id>
@@ -182,7 +184,7 @@ def SPIRV_InBoundsPtrAccessChainOp : SPIRV_Op<"InBoundsPtrAccessChain", [Pure]>
// -----
-def SPIRV_LoadOp : SPIRV_Op<"Load", []> {
+def SPIRV_LoadOp
+    : SPIRV_Op<"Load", [DeclareOpInterfaceMethods<AlignmentAttrOpInterface>]> {
let summary = "Load through a pointer.";
let description = [{
@@ -310,7 +312,7 @@ def SPIRV_PtrAccessChainOp : SPIRV_Op<"PtrAccessChain", [Pure]> {
// -----
-def SPIRV_StoreOp : SPIRV_Op<"Store", []> {
+def SPIRV_StoreOp
+    : SPIRV_Op<"Store",
+               [DeclareOpInterfaceMethods<AlignmentAttrOpInterface>]> {
let summary = "Store through a pointer.";
let description = [{
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.h b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.h
index 2676e92..0e1f6e7 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.h
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.h
@@ -20,6 +20,7 @@
#include "mlir/Dialect/SPIRV/Interfaces/SPIRVImageInterfaces.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/OpImplementation.h"
+#include "mlir/Interfaces/AlignmentAttrInterface.h"
#include "mlir/Interfaces/CallInterfaces.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h
index bbf55f5..b3a0653 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h
@@ -23,6 +23,7 @@
#include "mlir/IR/Dialect.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/AlignmentAttrInterface.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/DestinationStyleOpInterface.h"
#include "mlir/Interfaces/IndexingMapOpInterface.h"
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 6e15b1e..43172ff 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -19,6 +19,7 @@ include "mlir/Dialect/Vector/Interfaces/MaskableOpInterface.td"
include "mlir/Dialect/Vector/Interfaces/MaskingOpInterface.td"
include "mlir/Dialect/Vector/IR/Vector.td"
include "mlir/Dialect/Vector/IR/VectorAttributes.td"
+include "mlir/Interfaces/AlignmentAttrInterface.td"
include "mlir/Interfaces/ControlFlowInterfaces.td"
include "mlir/Interfaces/DestinationStyleOpInterface.td"
include "mlir/Interfaces/IndexingMapOpInterface.td"
@@ -1653,7 +1654,8 @@ def Vector_TransferWriteOp :
def Vector_LoadOp : Vector_Op<"load", [
DeclareOpInterfaceMethods<VectorUnrollOpInterface, ["getShapeForUnroll"]>,
- DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>
+ DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>,
+ DeclareOpInterfaceMethods<AlignmentAttrOpInterface>
]> {
let summary = "reads an n-D slice of memory into an n-D vector";
let description = [{
@@ -1770,7 +1772,8 @@ def Vector_LoadOp : Vector_Op<"load", [
def Vector_StoreOp : Vector_Op<"store", [
DeclareOpInterfaceMethods<VectorUnrollOpInterface, ["getShapeForUnroll"]>,
- DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>
+ DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>,
+ DeclareOpInterfaceMethods<AlignmentAttrOpInterface>
]> {
let summary = "writes an n-D vector to an n-D slice of memory";
let description = [{
@@ -1875,7 +1878,10 @@ def Vector_StoreOp : Vector_Op<"store", [
}
def Vector_MaskedLoadOp :
- Vector_Op<"maskedload", [DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>]>,
+ Vector_Op<"maskedload", [
+ DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>,
+ DeclareOpInterfaceMethods<AlignmentAttrOpInterface>
+ ]>,
Arguments<(ins Arg<AnyMemRef, "", [MemRead]>:$base,
Variadic<Index>:$indices,
VectorOfNonZeroRankOf<[I1]>:$mask,
@@ -1967,7 +1973,10 @@ def Vector_MaskedLoadOp :
}
def Vector_MaskedStoreOp :
- Vector_Op<"maskedstore", [DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>]>,
+ Vector_Op<"maskedstore", [
+ DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>,
+ DeclareOpInterfaceMethods<AlignmentAttrOpInterface>
+ ]>,
Arguments<(ins Arg<AnyMemRef, "", [MemWrite]>:$base,
Variadic<Index>:$indices,
VectorOfNonZeroRankOf<[I1]>:$mask,
@@ -2048,7 +2057,8 @@ def Vector_GatherOp :
Vector_Op<"gather", [
DeclareOpInterfaceMethods<MaskableOpInterface>,
DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>,
- DeclareOpInterfaceMethods<VectorUnrollOpInterface, ["getShapeForUnroll"]>
+ DeclareOpInterfaceMethods<VectorUnrollOpInterface, ["getShapeForUnroll"]>,
+ DeclareOpInterfaceMethods<AlignmentAttrOpInterface>
]>,
Arguments<(ins Arg<TensorOrMemRef<[AnyType]>, "", [MemRead]>:$base,
Variadic<Index>:$offsets,
@@ -2151,7 +2161,10 @@ def Vector_GatherOp :
}
def Vector_ScatterOp :
- Vector_Op<"scatter", [DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>]>,
+ Vector_Op<"scatter", [
+ DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>,
+ DeclareOpInterfaceMethods<AlignmentAttrOpInterface>
+ ]>,
Arguments<(ins Arg<AnyMemRef, "", [MemWrite]>:$base,
Variadic<Index>:$offsets,
VectorOfNonZeroRankOf<[AnyInteger, Index]>:$indices,
@@ -2236,7 +2249,10 @@ def Vector_ScatterOp :
}
def Vector_ExpandLoadOp :
- Vector_Op<"expandload", [DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>]>,
+ Vector_Op<"expandload", [
+ DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>,
+ DeclareOpInterfaceMethods<AlignmentAttrOpInterface>
+ ]>,
Arguments<(ins Arg<AnyMemRef, "", [MemRead]>:$base,
Variadic<Index>:$indices,
FixedVectorOfNonZeroRankOf<[I1]>:$mask,
@@ -2324,7 +2340,10 @@ def Vector_ExpandLoadOp :
}
def Vector_CompressStoreOp :
- Vector_Op<"compressstore", [DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>]>,
+ Vector_Op<"compressstore", [
+ DeclareOpInterfaceMethods<MemorySpaceCastConsumerOpInterface>,
+ DeclareOpInterfaceMethods<AlignmentAttrOpInterface>
+ ]>,
Arguments<(ins Arg<AnyMemRef, "", [MemWrite]>:$base,
Variadic<Index>:$indices,
FixedVectorOfNonZeroRankOf<[I1]>:$mask,
diff --git a/mlir/include/mlir/Interfaces/AlignmentAttrInterface.h b/mlir/include/mlir/Interfaces/AlignmentAttrInterface.h
new file mode 100644
index 0000000..5b52c22
--- /dev/null
+++ b/mlir/include/mlir/Interfaces/AlignmentAttrInterface.h
@@ -0,0 +1,21 @@
+//===- AlignmentAttrInterface.h - Alignment attribute interface -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_INTERFACES_ALIGNMENTATTRINTERFACE_H
+#define MLIR_INTERFACES_ALIGNMENTATTRINTERFACE_H
+
+#include "mlir/IR/OpDefinition.h"
+#include "llvm/Support/Alignment.h"
+
+namespace mlir {
+class MLIRContext;
+} // namespace mlir
+
+#include "mlir/Interfaces/AlignmentAttrInterface.h.inc"
+
+#endif // MLIR_INTERFACES_ALIGNMENTATTRINTERFACE_H
diff --git a/mlir/include/mlir/Interfaces/AlignmentAttrInterface.td b/mlir/include/mlir/Interfaces/AlignmentAttrInterface.td
new file mode 100644
index 0000000..931af69
--- /dev/null
+++ b/mlir/include/mlir/Interfaces/AlignmentAttrInterface.td
@@ -0,0 +1,65 @@
+//===- AlignmentAttrInterface.td - Alignment attribute interface -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an interface for operations that expose an optional
+// alignment attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_INTERFACES_ALIGNMENTATTRINTERFACE_TD
+#define MLIR_INTERFACES_ALIGNMENTATTRINTERFACE_TD
+
+include "mlir/IR/OpBase.td"
+
+def AlignmentAttrOpInterface : OpInterface<"AlignmentAttrOpInterface"> {
+ let description = [{
+ An interface for operations that carry an optional alignment attribute and
+ want to expose it as an `llvm::MaybeAlign` helper.
+ }];
+
+ let cppNamespace = "::mlir";
+
+ let methods = [
+ InterfaceMethod<[{
+ Returns the alignment encoded on the operation as an `llvm::MaybeAlign`.
+ Operations providing a differently named accessor can override the
+ default implementation.
+ }],
+ "::llvm::MaybeAlign",
+ "getMaybeAlign",
+ (ins),
+ [{
+ // Defensive: trait implementations are expected to validate power-of-two
+ // alignments, but we still guard against accidental misuse.
+ auto alignmentOpt = $_op.getAlignment();
+ if (!alignmentOpt || *alignmentOpt <= 0)
+ return ::llvm::MaybeAlign();
+ uint64_t value = static_cast<uint64_t>(*alignmentOpt);
+ if (!::llvm::isPowerOf2_64(value))
+ return ::llvm::MaybeAlign();
+ return ::llvm::MaybeAlign(value);
+ }]
+ >
+ ];
+
+ let extraTraitClassDeclaration = [{
+ ::llvm::MaybeAlign getMaybeAlign() {
+ // Defensive: trait implementations are expected to validate power-of-two
+ // alignments, but we still guard against accidental misuse.
+ auto alignmentOpt = (*static_cast<ConcreteOp *>(this)).getAlignment();
+ if (!alignmentOpt || *alignmentOpt <= 0)
+ return ::llvm::MaybeAlign();
+ uint64_t value = static_cast<uint64_t>(*alignmentOpt);
+ if (!::llvm::isPowerOf2_64(value))
+ return ::llvm::MaybeAlign();
+ return ::llvm::MaybeAlign(value);
+ }
+ }];
+}
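+
+// Example (illustrative only; `MyDialect_LoadOp` and its dialect are
+// hypothetical): an op that declares this interface and defines an optional
+// `alignment` attribute picks up `getMaybeAlign()` from the default
+// implementation above.
+//
+//   def MyDialect_LoadOp : MyDialect_Op<"load", [
+//       DeclareOpInterfaceMethods<AlignmentAttrOpInterface>]> {
+//     let arguments = (ins AnyMemRef:$base, OptionalAttr<I64Attr>:$alignment);
+//   }
+//
+// C++ callers can then query the alignment generically:
+//
+//   if (auto alignIface = dyn_cast<::mlir::AlignmentAttrOpInterface>(op))
+//     if (::llvm::MaybeAlign align = alignIface.getMaybeAlign())
+//       applyAlignment(*align); // hypothetical helper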
+
+#endif // MLIR_INTERFACES_ALIGNMENTATTRINTERFACE_TD
diff --git a/mlir/include/mlir/Interfaces/CMakeLists.txt b/mlir/include/mlir/Interfaces/CMakeLists.txt
index 72ed046..eb96a68 100644
--- a/mlir/include/mlir/Interfaces/CMakeLists.txt
+++ b/mlir/include/mlir/Interfaces/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_mlir_interface(AlignmentAttrInterface)
add_mlir_interface(CallInterfaces)
add_mlir_interface(CastInterfaces)
add_mlir_interface(ControlFlowInterfaces)
diff --git a/mlir/include/mlir/TableGen/CodeGenHelpers.h b/mlir/include/mlir/TableGen/CodeGenHelpers.h
index 997aef2..b56172f 100644
--- a/mlir/include/mlir/TableGen/CodeGenHelpers.h
+++ b/mlir/include/mlir/TableGen/CodeGenHelpers.h
@@ -52,6 +52,15 @@ private:
std::optional<llvm::NamespaceEmitter> nsEmitter;
};
+/// This enum describes how an error stream string being constructed will be
+/// consumed.
+enum class ErrorStreamType {
+ // Inside a string that's streamed into an InflightDiagnostic.
+ InString,
+ // Inside a string inside an OpError.
+ InsideOpError,
+};
+
/// This class deduplicates shared operation verification code by emitting
/// static functions alongside the op definitions. These methods are local to
/// the definition file, and are invoked within the operation verify methods.
@@ -192,7 +201,8 @@ private:
/// A generic function to emit constraints
void emitConstraints(const ConstraintMap &constraints, StringRef selfName,
- const char *codeTemplate);
+ const char *codeTemplate,
+ ErrorStreamType errorStreamType);
/// Assign a unique name to a unique constraint.
std::string getUniqueName(StringRef kind, unsigned index);
@@ -243,6 +253,18 @@ std::string stringify(T &&t) {
apply(std::forward<T>(t));
}
+/// Helper to generate a C++ streaming error message from a given message.
+/// The message can contain '{{...}}' placeholders that are substituted with
+/// C++ expressions via tgfmt. It effectively converts:
+///   "failed to verify {{foo}}"
+/// into:
+///   "failed to verify " << bar
+/// where bar is the result of evaluating 'tgfmt("foo", &ctx)' at tablegen
+/// time.
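+/// With ErrorStreamType::InsideOpError, the emitted fragment instead closes
+/// the surrounding emitOpError("...") call before streaming. For example
+/// (a sketch, under the same assumptions as above):
+///   "failed to verify {{foo}} here"
+/// becomes
+///   failed to verify ") << bar << (" here
+/// which, spliced into the emitOpError template, reads:
+///   emitOpError("failed to verify ") << bar << (" here")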
+std::string buildErrorStreamingString(
+ StringRef message, const FmtContext &ctx,
+ ErrorStreamType errorStreamType = ErrorStreamType::InString);
+
} // namespace tblgen
} // namespace mlir
diff --git a/mlir/lib/Dialect/Async/IR/Async.cpp b/mlir/lib/Dialect/Async/IR/Async.cpp
index 8e4a49d..e19b917 100644
--- a/mlir/lib/Dialect/Async/IR/Async.cpp
+++ b/mlir/lib/Dialect/Async/IR/Async.cpp
@@ -17,8 +17,6 @@ using namespace mlir::async;
#include "mlir/Dialect/Async/IR/AsyncOpsDialect.cpp.inc"
-constexpr StringRef AsyncDialect::kAllowedToBlockAttrName;
-
void AsyncDialect::initialize() {
addOperations<
#define GET_OP_LIST
diff --git a/mlir/lib/Dialect/DLTI/DLTI.cpp b/mlir/lib/Dialect/DLTI/DLTI.cpp
index 173d58b..da572f1 100644
--- a/mlir/lib/Dialect/DLTI/DLTI.cpp
+++ b/mlir/lib/Dialect/DLTI/DLTI.cpp
@@ -606,11 +606,6 @@ FailureOr<Attribute> dlti::query(Operation *op, ArrayRef<StringRef> keys,
return dlti::query(op, entryKeys, emitError);
}
-constexpr const StringLiteral mlir::DLTIDialect::kDataLayoutAttrName;
-constexpr const StringLiteral mlir::DLTIDialect::kDataLayoutEndiannessKey;
-constexpr const StringLiteral mlir::DLTIDialect::kDataLayoutEndiannessBig;
-constexpr const StringLiteral mlir::DLTIDialect::kDataLayoutEndiannessLittle;
-
namespace {
class TargetDataLayoutInterface : public DataLayoutDialectInterface {
public:
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 2731069..1bf4a1c 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -640,8 +640,6 @@ SuccessorOperands SwitchOp::getSuccessorOperands(unsigned index) {
// Code for LLVM::GEPOp.
//===----------------------------------------------------------------------===//
-constexpr int32_t GEPOp::kDynamicIndex;
-
GEPIndicesAdaptor<ValueRange> GEPOp::getIndices() {
return GEPIndicesAdaptor<ValueRange>(getRawConstantIndicesAttr(),
getDynamicIndices());
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index b2f1d84..8c9c137 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -1042,6 +1042,65 @@ struct RemoveConstantIfConditionWithRegion : public OpRewritePattern<OpTy> {
}
};
+/// Remove empty acc.kernel_environment operations. If the operation has wait
+/// operands, create an acc.wait operation to preserve synchronization.
+struct RemoveEmptyKernelEnvironment
+ : public OpRewritePattern<acc::KernelEnvironmentOp> {
+ using OpRewritePattern<acc::KernelEnvironmentOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(acc::KernelEnvironmentOp op,
+ PatternRewriter &rewriter) const override {
+ assert(op->getNumRegions() == 1 && "expected op to have one region");
+
+ Block &block = op.getRegion().front();
+ if (!block.empty())
+ return failure();
+
+ // Conservatively disable canonicalization of empty acc.kernel_environment
+ // operations if the wait operands in the kernel_environment cannot be fully
+    // represented by an acc.wait operation.
+
+ // Disable canonicalization if device type is not the default
+ if (auto deviceTypeAttr = op.getWaitOperandsDeviceTypeAttr()) {
+ for (auto attr : deviceTypeAttr) {
+ if (auto dtAttr = mlir::dyn_cast<acc::DeviceTypeAttr>(attr)) {
+ if (dtAttr.getValue() != mlir::acc::DeviceType::None)
+ return failure();
+ }
+ }
+ }
+
+ // Disable canonicalization if any wait segment has a devnum
+ if (auto hasDevnumAttr = op.getHasWaitDevnumAttr()) {
+ for (auto attr : hasDevnumAttr) {
+ if (auto boolAttr = mlir::dyn_cast<mlir::BoolAttr>(attr)) {
+ if (boolAttr.getValue())
+ return failure();
+ }
+ }
+ }
+
+ // Disable canonicalization if there are multiple wait segments
+ if (auto segmentsAttr = op.getWaitOperandsSegmentsAttr()) {
+ if (segmentsAttr.size() > 1)
+ return failure();
+ }
+
+ // Remove empty kernel environment.
+ // Preserve synchronization by creating acc.wait operation if needed.
+ if (!op.getWaitOperands().empty() || op.getWaitOnlyAttr())
+ rewriter.replaceOpWithNewOp<acc::WaitOp>(op, op.getWaitOperands(),
+ /*asyncOperand=*/Value(),
+ /*waitDevnum=*/Value(),
+ /*async=*/nullptr,
+ /*ifCond=*/Value());
+ else
+ rewriter.eraseOp(op);
+
+ return success();
+ }
+};
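+
+// Schematically (exact assembly syntax elided), the rewrite above performs:
+//
+//   acc.kernel_environment wait(%w) {
+//   }
+//   =>
+//   acc.wait(%w)
+//
+// and simply erases the op when there is no synchronization to preserve.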
+
//===----------------------------------------------------------------------===//
// Recipe Region Helpers
//===----------------------------------------------------------------------===//
@@ -2691,6 +2750,15 @@ void acc::HostDataOp::getCanonicalizationPatterns(RewritePatternSet &results,
}
//===----------------------------------------------------------------------===//
+// KernelEnvironmentOp
+//===----------------------------------------------------------------------===//
+
+void acc::KernelEnvironmentOp::getCanonicalizationPatterns(
+ RewritePatternSet &results, MLIRContext *context) {
+ results.add<RemoveEmptyKernelEnvironment>(context);
+}
+
+//===----------------------------------------------------------------------===//
// LoopOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitData.cpp b/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitData.cpp
new file mode 100644
index 0000000..a99e484
--- /dev/null
+++ b/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitData.cpp
@@ -0,0 +1,880 @@
+//===- ACCImplicitData.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the OpenACC specification for "Variables with
+// Implicitly Determined Data Attributes" (OpenACC 3.4 spec, section 2.6.2).
+//
+// Overview:
+// ---------
+// The pass automatically generates data clause operations for variables used
+// within OpenACC compute constructs (parallel, kernels, serial) that do not
+// already have explicit data clauses. The semantics follow these rules:
+//
+// 1. If there is a default(none) clause visible, no implicit data actions
+// apply.
+//
+// 2. An aggregate variable (arrays, derived types, etc.) will be treated as
+//    if it appears in:
+//    - A present clause when default(present) is visible.
+//    - A copy clause otherwise.
+//
+// 3. A scalar variable will be treated as if it appears in:
+// - A copy clause if the compute construct is a kernels construct.
+// - A firstprivate clause otherwise (parallel, serial).
+//
+// Requirements:
+// -------------
+// To use this pass in a pipeline, the following requirements must be met:
+//
+// 1. Type Interface Implementation: Variables from the dialect being used
+// must implement one or both of the following MLIR interfaces:
+// `acc::MappableType` and/or `acc::PointerLikeType`
+//
+// These interfaces provide the necessary methods for the pass to:
+// - Determine variable type categories (scalar vs. aggregate)
+// - Generate appropriate bounds information
+// - Generate privatization recipes
+//
+// 2. Operation Interface Implementation: Operations that access partial
+// entities or create views should implement the following MLIR
+// interfaces: `acc::PartialEntityAccess` and/or
+// `mlir::ViewLikeOpInterface`
+//
+// These interfaces are used for proper data clause ordering, ensuring
+// that base entities are mapped before derived entities (e.g., a
+// struct is mapped before its fields, an array is mapped before
+// subarray views).
+//
+// 3. Analysis Registration (Optional): If custom behavior is needed for
+// variable name extraction or alias analysis, the dialect should
+// pre-register the `acc::OpenACCSupport` and `mlir::AliasAnalysis` analyses.
+//
+// If not registered, default behavior will be used.
+//
+// Implementation Details:
+// -----------------------
+// The pass performs the following operations:
+//
+// 1. Finds candidate variables which are live-in to the compute region and
+// are not already in a data clause or private clause.
+//
+// 2. Generates both data "entry" and "exit" clause operations that match
+// the intended action depending on variable type:
+// - copy -> acc.copyin (entry) + acc.copyout (exit)
+// - present -> acc.present (entry) + acc.delete (exit)
+// - firstprivate -> acc.firstprivate (entry only, no exit)
+//
+// 3. Ensures that the default clause is taken into consideration by looking
+//    through the current construct and its parent constructs to find the
+//    "visible default clause".
+//
+// 4. Fixes up SSA value links so that uses in the acc region reference the
+// result of the newly created data clause operations.
+//
+// 5. When generating implicit data clause operations, the pass also attaches
+//    variable name information and marks the ops with the implicit flag.
+//
+// 6. Recipes are generated by calling the appropriate entrypoints in the
+// MappableType and PointerLikeType interfaces.
+//
+// 7. AliasAnalysis is used to determine if a variable is already covered by
+// an existing data clause (e.g., an interior pointer covered by its parent).
+//
+// Examples:
+// ---------
+//
+// Example 1: Scalar in parallel construct (implicit firstprivate)
+//
+// Before:
+// func.func @test() {
+// %scalar = memref.alloca() {acc.var_name = "x"} : memref<f32>
+// acc.parallel {
+// %val = memref.load %scalar[] : memref<f32>
+// acc.yield
+// }
+// }
+//
+// After:
+// func.func @test() {
+// %scalar = memref.alloca() {acc.var_name = "x"} : memref<f32>
+// %firstpriv = acc.firstprivate varPtr(%scalar : memref<f32>)
+// -> memref<f32> {implicit = true, name = "x"}
+// acc.parallel firstprivate(@recipe -> %firstpriv : memref<f32>) {
+// %val = memref.load %firstpriv[] : memref<f32>
+// acc.yield
+// }
+// }
+//
+// Example 2: Scalar in kernels construct (implicit copy)
+//
+// Before:
+// func.func @test() {
+// %scalar = memref.alloca() {acc.var_name = "n"} : memref<i32>
+// acc.kernels {
+// %val = memref.load %scalar[] : memref<i32>
+// acc.terminator
+// }
+// }
+//
+// After:
+// func.func @test() {
+// %scalar = memref.alloca() {acc.var_name = "n"} : memref<i32>
+// %copyin = acc.copyin varPtr(%scalar : memref<i32>) -> memref<i32>
+// {dataClause = #acc<data_clause acc_copy>,
+// implicit = true, name = "n"}
+// acc.kernels dataOperands(%copyin : memref<i32>) {
+// %val = memref.load %copyin[] : memref<i32>
+// acc.terminator
+// }
+// acc.copyout accPtr(%copyin : memref<i32>)
+// to varPtr(%scalar : memref<i32>)
+// {dataClause = #acc<data_clause acc_copy>,
+// implicit = true, name = "n"}
+// }
+//
+// Example 3: Array (aggregate) in parallel (implicit copy)
+//
+// Before:
+// func.func @test() {
+// %array = memref.alloca() {acc.var_name = "arr"} : memref<100xf32>
+// acc.parallel {
+// %c0 = arith.constant 0 : index
+// %val = memref.load %array[%c0] : memref<100xf32>
+// acc.yield
+// }
+// }
+//
+// After:
+// func.func @test() {
+// %array = memref.alloca() {acc.var_name = "arr"} : memref<100xf32>
+// %copyin = acc.copyin varPtr(%array : memref<100xf32>)
+// -> memref<100xf32>
+// {dataClause = #acc<data_clause acc_copy>,
+// implicit = true, name = "arr"}
+// acc.parallel dataOperands(%copyin : memref<100xf32>) {
+// %c0 = arith.constant 0 : index
+// %val = memref.load %copyin[%c0] : memref<100xf32>
+// acc.yield
+// }
+// acc.copyout accPtr(%copyin : memref<100xf32>)
+// to varPtr(%array : memref<100xf32>)
+// {dataClause = #acc<data_clause acc_copy>,
+// implicit = true, name = "arr"}
+// }
+//
+// Example 4: Array with default(present)
+//
+// Before:
+// func.func @test() {
+// %array = memref.alloca() {acc.var_name = "arr"} : memref<100xf32>
+// acc.parallel {
+// %c0 = arith.constant 0 : index
+// %val = memref.load %array[%c0] : memref<100xf32>
+// acc.yield
+// } attributes {defaultAttr = #acc<defaultvalue present>}
+// }
+//
+// After:
+// func.func @test() {
+// %array = memref.alloca() {acc.var_name = "arr"} : memref<100xf32>
+// %present = acc.present varPtr(%array : memref<100xf32>)
+// -> memref<100xf32>
+// {implicit = true, name = "arr"}
+// acc.parallel dataOperands(%present : memref<100xf32>)
+// attributes {defaultAttr = #acc<defaultvalue present>} {
+// %c0 = arith.constant 0 : index
+// %val = memref.load %present[%c0] : memref<100xf32>
+// acc.yield
+// }
+// acc.delete accPtr(%present : memref<100xf32>)
+// {dataClause = #acc<data_clause acc_present>,
+// implicit = true, name = "arr"}
+// }
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h"
+
+#include "mlir/Analysis/AliasAnalysis.h"
+#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/OpenACC/OpenACCUtils.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <type_traits>
+
+namespace mlir {
+namespace acc {
+#define GEN_PASS_DEF_ACCIMPLICITDATA
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"
+} // namespace acc
+} // namespace mlir
+
+#define DEBUG_TYPE "acc-implicit-data"
+
+using namespace mlir;
+
+namespace {
+
+class ACCImplicitData : public acc::impl::ACCImplicitDataBase<ACCImplicitData> {
+public:
+ using acc::impl::ACCImplicitDataBase<ACCImplicitData>::ACCImplicitDataBase;
+
+ void runOnOperation() override;
+
+private:
+ /// Collects all data clauses that dominate the compute construct.
+ /// Needed to determine if a variable is already covered by an existing data
+ /// clause.
+ SmallVector<Value> getDominatingDataClauses(Operation *computeConstructOp);
+
+ /// Looks through the `dominatingDataClauses` to find the original data clause
+ /// op for an alias. Returns nullptr if no original data clause op is found.
+ template <typename OpT>
+ Operation *getOriginalDataClauseOpForAlias(
+ Value var, OpBuilder &builder, OpT computeConstructOp,
+ const SmallVector<Value> &dominatingDataClauses);
+
+  /// Generates the appropriate `acc.copyin`, `acc.present`, `acc.firstprivate`,
+ /// etc. data clause op for a candidate variable.
+ template <typename OpT>
+ Operation *generateDataClauseOpForCandidate(
+ Value var, ModuleOp &module, OpBuilder &builder, OpT computeConstructOp,
+ const SmallVector<Value> &dominatingDataClauses,
+ const std::optional<acc::ClauseDefaultValue> &defaultClause);
+
+ /// Generates the implicit data ops for a compute construct.
+ template <typename OpT>
+ void generateImplicitDataOps(
+ ModuleOp &module, OpT computeConstructOp,
+ std::optional<acc::ClauseDefaultValue> &defaultClause);
+
+ /// Generates a private recipe for a variable.
+ acc::PrivateRecipeOp generatePrivateRecipe(ModuleOp &module, Value var,
+ Location loc, OpBuilder &builder,
+ acc::OpenACCSupport &accSupport);
+
+ /// Generates a firstprivate recipe for a variable.
+ acc::FirstprivateRecipeOp
+ generateFirstprivateRecipe(ModuleOp &module, Value var, Location loc,
+ OpBuilder &builder,
+ acc::OpenACCSupport &accSupport);
+
+ /// Generates recipes for a list of variables.
+ void generateRecipes(ModuleOp &module, OpBuilder &builder,
+ Operation *computeConstructOp,
+ const SmallVector<Value> &newOperands,
+ SmallVector<Attribute> &newRecipeSyms);
+};
+
+/// Determines if a variable is a candidate for implicit data mapping.
+/// Returns true if the variable is a candidate, false otherwise.
+static bool isCandidateForImplicitData(Value val, Region &accRegion) {
+ // Ensure the variable is an allowed type for data clause.
+ if (!acc::isPointerLikeType(val.getType()) &&
+ !acc::isMappableType(val.getType()))
+ return false;
+
+ // If this is already coming from a data clause, we do not need to generate
+ // another.
+ if (isa_and_nonnull<ACC_DATA_ENTRY_OPS>(val.getDefiningOp()))
+ return false;
+
+ // If this is only used by private clauses, it is not a real live-in.
+ if (acc::isOnlyUsedByPrivateClauses(val, accRegion))
+ return false;
+
+ return true;
+}
+
+SmallVector<Value>
+ACCImplicitData::getDominatingDataClauses(Operation *computeConstructOp) {
+ llvm::SmallSetVector<Value, 8> dominatingDataClauses;
+
+ llvm::TypeSwitch<Operation *>(computeConstructOp)
+ .Case<acc::ParallelOp, acc::KernelsOp, acc::SerialOp>([&](auto op) {
+ for (auto dataClause : op.getDataClauseOperands()) {
+ dominatingDataClauses.insert(dataClause);
+ }
+ })
+ .Default([](Operation *) {});
+
+ // Collect the data clauses from enclosing data constructs.
+ Operation *currParentOp = computeConstructOp->getParentOp();
+ while (currParentOp) {
+ if (isa<acc::DataOp>(currParentOp)) {
+ for (auto dataClause :
+ dyn_cast<acc::DataOp>(currParentOp).getDataClauseOperands()) {
+ dominatingDataClauses.insert(dataClause);
+ }
+ }
+ currParentOp = currParentOp->getParentOp();
+ }
+
+ // Find the enclosing function/subroutine
+ auto funcOp = computeConstructOp->getParentOfType<FunctionOpInterface>();
+ if (!funcOp)
+ return dominatingDataClauses.takeVector();
+
+ // Walk the function to find `acc.declare_enter`/`acc.declare_exit` pairs that
+ // dominate and post-dominate the compute construct and add their data
+ // clauses to the list.
+ auto &domInfo = this->getAnalysis<DominanceInfo>();
+ auto &postDomInfo = this->getAnalysis<PostDominanceInfo>();
+ funcOp->walk([&](acc::DeclareEnterOp declareEnterOp) {
+ if (domInfo.dominates(declareEnterOp.getOperation(), computeConstructOp)) {
+ // Collect all `acc.declare_exit` ops for this token.
+ SmallVector<acc::DeclareExitOp> exits;
+ for (auto *user : declareEnterOp.getToken().getUsers())
+ if (auto declareExit = dyn_cast<acc::DeclareExitOp>(user))
+ exits.push_back(declareExit);
+
+ // Only add clauses if every `acc.declare_exit` op post-dominates the
+ // compute construct.
+ if (!exits.empty() && llvm::all_of(exits, [&](acc::DeclareExitOp exitOp) {
+ return postDomInfo.postDominates(exitOp, computeConstructOp);
+ })) {
+ for (auto dataClause : declareEnterOp.getDataClauseOperands())
+ dominatingDataClauses.insert(dataClause);
+ }
+ }
+ });
+
+ return dominatingDataClauses.takeVector();
+}
+
+template <typename OpT>
+Operation *ACCImplicitData::getOriginalDataClauseOpForAlias(
+ Value var, OpBuilder &builder, OpT computeConstructOp,
+ const SmallVector<Value> &dominatingDataClauses) {
+ auto &aliasAnalysis = this->getAnalysis<AliasAnalysis>();
+ for (auto dataClause : dominatingDataClauses) {
+ if (auto *dataClauseOp = dataClause.getDefiningOp()) {
+ // Only accept clauses that guarantee that the alias is present.
+ if (isa<acc::CopyinOp, acc::CreateOp, acc::PresentOp, acc::NoCreateOp,
+ acc::DevicePtrOp>(dataClauseOp))
+ if (aliasAnalysis.alias(acc::getVar(dataClauseOp), var).isMust())
+ return dataClauseOp;
+ }
+ }
+ return nullptr;
+}
+
+// Generates bounds for variables that have unknown dimensions.
+static void fillInBoundsForUnknownDimensions(Operation *dataClauseOp,
+ OpBuilder &builder) {
+
+ if (!acc::getBounds(dataClauseOp).empty())
+ // If bounds are already present, do not overwrite them.
+ return;
+
+ // For types that have unknown dimensions, attempt to generate bounds by
+ // relying on MappableType being able to extract it from the IR.
+ auto var = acc::getVar(dataClauseOp);
+ auto type = var.getType();
+ if (auto mappableTy = dyn_cast<acc::MappableType>(type)) {
+ if (mappableTy.hasUnknownDimensions()) {
+ TypeSwitch<Operation *>(dataClauseOp)
+ .Case<ACC_DATA_ENTRY_OPS, ACC_DATA_EXIT_OPS>([&](auto dataClauseOp) {
+ if (std::is_same_v<decltype(dataClauseOp), acc::DevicePtrOp>)
+ return;
+ OpBuilder::InsertionGuard guard(builder);
+ builder.setInsertionPoint(dataClauseOp);
+ auto bounds = mappableTy.generateAccBounds(var, builder);
+ if (!bounds.empty())
+ dataClauseOp.getBoundsMutable().assign(bounds);
+ });
+ }
+ }
+}
+
+acc::PrivateRecipeOp
+ACCImplicitData::generatePrivateRecipe(ModuleOp &module, Value var,
+ Location loc, OpBuilder &builder,
+ acc::OpenACCSupport &accSupport) {
+ auto type = var.getType();
+ std::string recipeName =
+ accSupport.getRecipeName(acc::RecipeKind::private_recipe, type, var);
+
+ // Check if recipe already exists
+ auto existingRecipe = module.lookupSymbol<acc::PrivateRecipeOp>(recipeName);
+ if (existingRecipe)
+ return existingRecipe;
+
+ // Set insertion point to module body in a scoped way
+ OpBuilder::InsertionGuard guard(builder);
+ builder.setInsertionPointToStart(module.getBody());
+
+ auto recipe =
+ acc::PrivateRecipeOp::createAndPopulate(builder, loc, recipeName, type);
+ if (!recipe.has_value())
+ return accSupport.emitNYI(loc, "implicit private"), nullptr;
+ return recipe.value();
+}
+
+acc::FirstprivateRecipeOp
+ACCImplicitData::generateFirstprivateRecipe(ModuleOp &module, Value var,
+ Location loc, OpBuilder &builder,
+ acc::OpenACCSupport &accSupport) {
+ auto type = var.getType();
+ std::string recipeName =
+ accSupport.getRecipeName(acc::RecipeKind::firstprivate_recipe, type, var);
+
+ // Check if recipe already exists
+ auto existingRecipe =
+ module.lookupSymbol<acc::FirstprivateRecipeOp>(recipeName);
+ if (existingRecipe)
+ return existingRecipe;
+
+ // Set insertion point to module body in a scoped way
+ OpBuilder::InsertionGuard guard(builder);
+ builder.setInsertionPointToStart(module.getBody());
+
+ auto recipe = acc::FirstprivateRecipeOp::createAndPopulate(builder, loc,
+ recipeName, type);
+ if (!recipe.has_value())
+ return accSupport.emitNYI(loc, "implicit firstprivate"), nullptr;
+ return recipe.value();
+}
+
+void ACCImplicitData::generateRecipes(ModuleOp &module, OpBuilder &builder,
+ Operation *computeConstructOp,
+ const SmallVector<Value> &newOperands,
+ SmallVector<Attribute> &newRecipeSyms) {
+ auto &accSupport = this->getAnalysis<acc::OpenACCSupport>();
+ for (auto var : newOperands) {
+ auto loc{var.getLoc()};
+ if (isa<acc::PrivateOp>(var.getDefiningOp())) {
+ auto recipe = generatePrivateRecipe(
+ module, acc::getVar(var.getDefiningOp()), loc, builder, accSupport);
+ if (recipe)
+ newRecipeSyms.push_back(SymbolRefAttr::get(module->getContext(),
+ recipe.getSymName().str()));
+ } else if (isa<acc::FirstprivateOp>(var.getDefiningOp())) {
+ auto recipe = generateFirstprivateRecipe(
+ module, acc::getVar(var.getDefiningOp()), loc, builder, accSupport);
+ if (recipe)
+ newRecipeSyms.push_back(SymbolRefAttr::get(module->getContext(),
+ recipe.getSymName().str()));
+ } else {
+ accSupport.emitNYI(var.getLoc(), "implicit reduction");
+ }
+ }
+}
+
+// Generates the data entry data op clause so that it adheres to OpenACC
+// rules as follows (line numbers and specification from OpenACC 3.4):
+// 1388 An aggregate variable will be treated as if it appears either:
+// 1389 - In a present clause if there is a default(present) clause visible at
+// the compute construct.
+// 1391 - In a copy clause otherwise.
+// 1392 A scalar variable will be treated as if it appears either:
+// 1393 - In a copy clause if the compute construct is a kernels construct.
+// 1394 - In a firstprivate clause otherwise.
+template <typename OpT>
+Operation *ACCImplicitData::generateDataClauseOpForCandidate(
+ Value var, ModuleOp &module, OpBuilder &builder, OpT computeConstructOp,
+ const SmallVector<Value> &dominatingDataClauses,
+ const std::optional<acc::ClauseDefaultValue> &defaultClause) {
+ auto &accSupport = this->getAnalysis<acc::OpenACCSupport>();
+ acc::VariableTypeCategory typeCategory =
+ acc::VariableTypeCategory::uncategorized;
+ if (auto mappableTy = dyn_cast<acc::MappableType>(var.getType())) {
+ typeCategory = mappableTy.getTypeCategory(var);
+ } else if (auto pointerLikeTy =
+ dyn_cast<acc::PointerLikeType>(var.getType())) {
+ typeCategory = pointerLikeTy.getPointeeTypeCategory(
+ cast<TypedValue<acc::PointerLikeType>>(var),
+ pointerLikeTy.getElementType());
+ }
+
+ bool isScalar =
+ acc::bitEnumContainsAny(typeCategory, acc::VariableTypeCategory::scalar);
+ bool isAnyAggregate = acc::bitEnumContainsAny(
+ typeCategory, acc::VariableTypeCategory::aggregate);
+ Location loc = computeConstructOp->getLoc();
+
+ Operation *op = nullptr;
+ op = getOriginalDataClauseOpForAlias(var, builder, computeConstructOp,
+ dominatingDataClauses);
+ if (op) {
+ if (isa<acc::NoCreateOp>(op))
+ return acc::NoCreateOp::create(builder, loc, var,
+ /*structured=*/true, /*implicit=*/true,
+ accSupport.getVariableName(var),
+ acc::getBounds(op));
+
+ if (isa<acc::DevicePtrOp>(op))
+ return acc::DevicePtrOp::create(builder, loc, var,
+ /*structured=*/true, /*implicit=*/true,
+ accSupport.getVariableName(var),
+ acc::getBounds(op));
+
+ // The original data clause op is a PresentOp, CopyinOp, or CreateOp,
+ // hence guaranteed to be present.
+ return acc::PresentOp::create(builder, loc, var,
+ /*structured=*/true, /*implicit=*/true,
+ accSupport.getVariableName(var),
+ acc::getBounds(op));
+ } else if (isScalar) {
+ if (enableImplicitReductionCopy &&
+ acc::isOnlyUsedByReductionClauses(var,
+ computeConstructOp->getRegion(0))) {
+ auto copyinOp =
+ acc::CopyinOp::create(builder, loc, var,
+ /*structured=*/true, /*implicit=*/true,
+ accSupport.getVariableName(var));
+ copyinOp.setDataClause(acc::DataClause::acc_reduction);
+ return copyinOp.getOperation();
+ }
+ if constexpr (std::is_same_v<OpT, acc::KernelsOp> ||
+ std::is_same_v<OpT, acc::KernelEnvironmentOp>) {
+      // Scalars get an implicit copy in a kernels construct.
+      // We do the same for acc.kernel_environment because the semantics of
+      // user variable mappings should have been applied while the ACC
+      // construct still existed; at this point we should only be dealing with
+      // unmapped variables that were made live-in by the compiler.
+ // TODO: This may be revisited.
+ auto copyinOp =
+ acc::CopyinOp::create(builder, loc, var,
+ /*structured=*/true, /*implicit=*/true,
+ accSupport.getVariableName(var));
+ copyinOp.setDataClause(acc::DataClause::acc_copy);
+ return copyinOp.getOperation();
+ } else {
+      // Scalars are implicitly firstprivate in parallel and serial constructs.
+ return acc::FirstprivateOp::create(builder, loc, var,
+ /*structured=*/true, /*implicit=*/true,
+ accSupport.getVariableName(var));
+ }
+ } else if (isAnyAggregate) {
+ Operation *newDataOp = nullptr;
+
+ // When default(present) is true, the implicit behavior is present.
+ if (defaultClause.has_value() &&
+ defaultClause.value() == acc::ClauseDefaultValue::Present) {
+ newDataOp = acc::PresentOp::create(builder, loc, var,
+ /*structured=*/true, /*implicit=*/true,
+ accSupport.getVariableName(var));
+ } else {
+ auto copyinOp =
+ acc::CopyinOp::create(builder, loc, var,
+ /*structured=*/true, /*implicit=*/true,
+ accSupport.getVariableName(var));
+ copyinOp.setDataClause(acc::DataClause::acc_copy);
+ newDataOp = copyinOp.getOperation();
+ }
+
+ return newDataOp;
+ } else {
+    // This is not a fatal error - for example, when the element type is
+    // itself a pointer type (i.e., we have a pointer to a pointer), it is
+    // potentially a deep-copy scenario which is not handled here.
+    // Other types need to be canonicalized. Thus, just log unhandled cases.
+ LLVM_DEBUG(llvm::dbgs()
+ << "Unhandled case for implicit data mapping " << var << "\n");
+ }
+ return nullptr;
+}
+
+// Ensures that result values from the acc data clause ops are used inside the
+// acc region, i.e.:
+// acc.kernels {
+// use %val
+// }
+// =>
+// %dev = acc.dataop %val
+// acc.kernels {
+// use %dev
+// }
+static void legalizeValuesInRegion(Region &accRegion,
+ SmallVector<Value> &newPrivateOperands,
+ SmallVector<Value> &newDataClauseOperands) {
+ for (Value dataClause :
+ llvm::concat<Value>(newDataClauseOperands, newPrivateOperands)) {
+ Value var = acc::getVar(dataClause.getDefiningOp());
+ replaceAllUsesInRegionWith(var, dataClause, accRegion);
+ }
+}
+
+// Adds the private operands and private recipes to the compute construct
+// operation in a valid way (ensures that the index in the privatizationRecipes
+// array matches the position of the private operand).
+template <typename OpT>
+static void
+addNewPrivateOperands(OpT &accOp, const SmallVector<Value> &privateOperands,
+ const SmallVector<Attribute> &privateRecipeSyms) {
+ assert(privateOperands.size() == privateRecipeSyms.size());
+ if (privateOperands.empty())
+ return;
+
+ SmallVector<Attribute> completePrivateRecipesSyms;
+ SmallVector<Attribute> completeFirstprivateRecipesSyms;
+ SmallVector<Value> newPrivateOperands;
+ SmallVector<Value> newFirstprivateOperands;
+
+ // Collect all of the existing recipes since they are held in an attribute.
+ // To add to it, we need to create a brand new one.
+ if (accOp.getPrivatizationRecipes().has_value())
+ for (auto privatization : accOp.getPrivatizationRecipesAttr())
+ completePrivateRecipesSyms.push_back(privatization);
+ if (accOp.getFirstprivatizationRecipes().has_value())
+ for (auto privatization : accOp.getFirstprivatizationRecipesAttr())
+ completeFirstprivateRecipesSyms.push_back(privatization);
+
+ // Now separate between private and firstprivate operands.
+ for (auto [priv, privateRecipeSym] :
+ llvm::zip(privateOperands, privateRecipeSyms)) {
+ if (isa<acc::PrivateOp>(priv.getDefiningOp())) {
+ newPrivateOperands.push_back(priv);
+ completePrivateRecipesSyms.push_back(privateRecipeSym);
+ } else if (isa<acc::FirstprivateOp>(priv.getDefiningOp())) {
+ newFirstprivateOperands.push_back(priv);
+ completeFirstprivateRecipesSyms.push_back(privateRecipeSym);
+ } else {
+ llvm_unreachable("unhandled private operand");
+ }
+ }
+
+ // Append all of the new private operands to their appropriate list.
+ accOp.getPrivateOperandsMutable().append(newPrivateOperands);
+ accOp.getFirstprivateOperandsMutable().append(newFirstprivateOperands);
+
+ // Update the privatizationRecipes attributes to hold all of the new recipes.
+ if (!completePrivateRecipesSyms.empty())
+ accOp.setPrivatizationRecipesAttr(
+ ArrayAttr::get(accOp.getContext(), completePrivateRecipesSyms));
+ if (!completeFirstprivateRecipesSyms.empty())
+ accOp.setFirstprivatizationRecipesAttr(
+ ArrayAttr::get(accOp.getContext(), completeFirstprivateRecipesSyms));
+}
+
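+/// Returns the first data exit op (e.g. acc.copyout or acc.delete) that uses
+/// the acc variable produced by `dataEntryOp`, or nullptr if none exists.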
+static Operation *findDataExitOp(Operation *dataEntryOp) {
+ auto res = acc::getAccVar(dataEntryOp);
+ for (auto *user : res.getUsers())
+ if (isa<ACC_DATA_EXIT_OPS>(user))
+ return user;
+ return nullptr;
+}
+
+// Generates matching data exit operation as described in the acc dialect
+// for how data clauses are decomposed:
+// https://mlir.llvm.org/docs/Dialects/OpenACCDialect/#operation-categories
+// Key ones used here:
+// * acc {construct} copy -> acc.copyin (before region) + acc.copyout (after
+// region)
+// * acc {construct} present -> acc.present (before region) + acc.delete
+// (after region)
+static void
+generateDataExitOperations(OpBuilder &builder, Operation *accOp,
+ const SmallVector<Value> &newDataClauseOperands,
+ const SmallVector<Value> &sortedDataClauseOperands) {
+ builder.setInsertionPointAfter(accOp);
+ Value lastDataClause = nullptr;
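+  // Walk the entries in reverse so that exit ops are emitted in the inverse
+  // order of the entry ops (derived entities are released before their base
+  // entities). `lastDataClause` tracks the previously handled entry so each
+  // new exit op is placed after that entry's exit op.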
+ for (auto dataEntry : llvm::reverse(sortedDataClauseOperands)) {
+ if (llvm::find(newDataClauseOperands, dataEntry) ==
+ newDataClauseOperands.end()) {
+ // If this is not a new data clause operand, we should not generate an
+ // exit operation for it.
+ lastDataClause = dataEntry;
+ continue;
+ }
+ if (lastDataClause)
+ if (auto *dataExitOp = findDataExitOp(lastDataClause.getDefiningOp()))
+ builder.setInsertionPointAfter(dataExitOp);
+ Operation *dataEntryOp = dataEntry.getDefiningOp();
+ if (isa<acc::CopyinOp>(dataEntryOp)) {
+ auto copyoutOp = acc::CopyoutOp::create(
+ builder, dataEntryOp->getLoc(), dataEntry, acc::getVar(dataEntryOp),
+ /*structured=*/true, /*implicit=*/true,
+ acc::getVarName(dataEntryOp).value(), acc::getBounds(dataEntryOp));
+ copyoutOp.setDataClause(acc::DataClause::acc_copy);
+ } else if (isa<acc::PresentOp, acc::NoCreateOp>(dataEntryOp)) {
+ auto deleteOp = acc::DeleteOp::create(
+ builder, dataEntryOp->getLoc(), dataEntry,
+ /*structured=*/true, /*implicit=*/true,
+ acc::getVarName(dataEntryOp).value(), acc::getBounds(dataEntryOp));
+ deleteOp.setDataClause(acc::getDataClause(dataEntryOp).value());
+ } else if (isa<acc::DevicePtrOp>(dataEntryOp)) {
+ // Do nothing.
+ } else {
+ llvm_unreachable("unhandled data exit");
+ }
+ lastDataClause = dataEntry;
+ }
+}
+
+/// Returns all base references of a value in order.
+/// So for example, if we have a reference to a struct field like
+/// s.f1.f2.f3, this will return <s, s.f1, s.f1.f2, s.f1.f2.f3>.
+/// Any intermediate casts/view-like operations are included in the
+/// chain as well.
+static SmallVector<Value> getBaseRefsChain(Value val) {
+ SmallVector<Value> baseRefs;
+ baseRefs.push_back(val);
+ while (true) {
+ Value prevVal = val;
+
+ val = acc::getBaseEntity(val);
+ if (val != baseRefs.front())
+ baseRefs.insert(baseRefs.begin(), val);
+
+ // If this is a view-like operation, it is effectively another
+ // view of the same entity so we should add it to the chain also.
+ if (auto viewLikeOp = val.getDefiningOp<ViewLikeOpInterface>()) {
+ val = viewLikeOp.getViewSource();
+ baseRefs.insert(baseRefs.begin(), val);
+ }
+
+    // Stop once no further progress is made.
+ if (val == prevVal)
+ break;
+ }
+
+ return baseRefs;
+}
+
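+/// Inserts the data clause operand produced by `newClause` into
+/// `sortedDataClauseOperands` so that clauses for base entities precede
+/// clauses for entities derived from them (e.g. a struct before its fields,
+/// an array before its subviews). The op itself is also moved in the IR so
+/// the textual order matches the operand order.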
+static void insertInSortedOrder(SmallVector<Value> &sortedDataClauseOperands,
+ Operation *newClause) {
+ auto *insertPos =
+ std::find_if(sortedDataClauseOperands.begin(),
+ sortedDataClauseOperands.end(), [&](Value dataClauseVal) {
+                       // Get the base-ref chain for the clause currently
+                       // being examined.
+ auto var = acc::getVar(dataClauseVal.getDefiningOp());
+ auto baseRefs = getBaseRefsChain(var);
+
+                       // If newClause maps a base ref of an existing clause's
+                       // variable, it must be inserted right before that
+                       // clause. Return true to stop the iteration when this
+                       // is the case.
+ return std::find(baseRefs.begin(), baseRefs.end(),
+ acc::getVar(newClause)) != baseRefs.end();
+ });
+
+ if (insertPos != sortedDataClauseOperands.end()) {
+ newClause->moveBefore(insertPos->getDefiningOp());
+ sortedDataClauseOperands.insert(insertPos, acc::getAccVar(newClause));
+ } else {
+ sortedDataClauseOperands.push_back(acc::getAccVar(newClause));
+ }
+}
+
+template <typename OpT>
+void ACCImplicitData::generateImplicitDataOps(
+ ModuleOp &module, OpT computeConstructOp,
+ std::optional<acc::ClauseDefaultValue> &defaultClause) {
+ // Implicit data attributes are only applied if "[t]here is no default(none)
+ // clause visible at the compute construct."
+ if (defaultClause.has_value() &&
+ defaultClause.value() == acc::ClauseDefaultValue::None)
+ return;
+ assert(!defaultClause.has_value() ||
+ defaultClause.value() == acc::ClauseDefaultValue::Present);
+
+ // 1) Collect live-in values.
+ Region &accRegion = computeConstructOp->getRegion(0);
+ SetVector<Value> liveInValues;
+ getUsedValuesDefinedAbove(accRegion, liveInValues);
+
+  // 2) Run the filtering to find relevant pointers that need to be copied.
+ auto isCandidate{[&](Value val) -> bool {
+ return isCandidateForImplicitData(val, accRegion);
+ }};
+ auto candidateVars(
+ llvm::to_vector(llvm::make_filter_range(liveInValues, isCandidate)));
+ if (candidateVars.empty())
+ return;
+
+ // 3) Generate data clauses for the variables.
+ SmallVector<Value> newPrivateOperands;
+ SmallVector<Value> newDataClauseOperands;
+ OpBuilder builder(computeConstructOp);
+  LLVM_DEBUG(llvm::dbgs() << "== Generating clauses for ==\n"
+                          << computeConstructOp << "\n");
+ auto dominatingDataClauses = getDominatingDataClauses(computeConstructOp);
+ for (auto var : candidateVars) {
+ auto newDataClauseOp = generateDataClauseOpForCandidate(
+ var, module, builder, computeConstructOp, dominatingDataClauses,
+ defaultClause);
+ fillInBoundsForUnknownDimensions(newDataClauseOp, builder);
+ LLVM_DEBUG(llvm::dbgs() << "Generated data clause for " << var << ":\n"
+ << "\t" << *newDataClauseOp << "\n");
+ if (isa_and_nonnull<acc::PrivateOp, acc::FirstprivateOp, acc::ReductionOp>(
+ newDataClauseOp)) {
+ newPrivateOperands.push_back(acc::getAccVar(newDataClauseOp));
+ } else if (isa_and_nonnull<ACC_DATA_CLAUSE_OPS>(newDataClauseOp)) {
+ newDataClauseOperands.push_back(acc::getAccVar(newDataClauseOp));
+ dominatingDataClauses.push_back(acc::getAccVar(newDataClauseOp));
+ }
+ }
+
+  // 4) Legalize values in the region (i.e., make uses in the region reference
+  // the results of the data clause ops).
+ legalizeValuesInRegion(accRegion, newPrivateOperands, newDataClauseOperands);
+
+ SmallVector<Attribute> newPrivateRecipeSyms;
+ // 5) Generate private recipes which are required for properly attaching
+ // private operands.
+ if constexpr (!std::is_same_v<OpT, acc::KernelsOp> &&
+ !std::is_same_v<OpT, acc::KernelEnvironmentOp>)
+ generateRecipes(module, builder, computeConstructOp, newPrivateOperands,
+ newPrivateRecipeSyms);
+
+ // 6) Figure out insertion order for the new data clause operands.
+ SmallVector<Value> sortedDataClauseOperands(
+ computeConstructOp.getDataClauseOperands());
+ for (auto newClause : newDataClauseOperands)
+ insertInSortedOrder(sortedDataClauseOperands, newClause.getDefiningOp());
+
+ // 7) Generate the data exit operations.
+ generateDataExitOperations(builder, computeConstructOp, newDataClauseOperands,
+ sortedDataClauseOperands);
+
+ // 8) Add all of the new operands to the compute construct op.
+ assert(newPrivateOperands.size() == newPrivateRecipeSyms.size() &&
+ "sizes must match");
+ if constexpr (!std::is_same_v<OpT, acc::KernelsOp> &&
+ !std::is_same_v<OpT, acc::KernelEnvironmentOp>)
+ addNewPrivateOperands(computeConstructOp, newPrivateOperands,
+ newPrivateRecipeSyms);
+
+ computeConstructOp.getDataClauseOperandsMutable().assign(
+ sortedDataClauseOperands);
+}
+
+void ACCImplicitData::runOnOperation() {
+ ModuleOp module = this->getOperation();
+ module.walk([&](Operation *op) {
+ if (isa<ACC_COMPUTE_CONSTRUCT_OPS, acc::KernelEnvironmentOp>(op)) {
+ assert(op->getNumRegions() == 1 && "must have 1 region");
+
+ auto defaultClause = acc::getDefaultAttr(op);
+ llvm::TypeSwitch<Operation *, void>(op)
+ .Case<ACC_COMPUTE_CONSTRUCT_OPS, acc::KernelEnvironmentOp>(
+ [&](auto op) {
+ generateImplicitDataOps(module, op, defaultClause);
+ })
+ .Default([&](Operation *) {});
+ }
+ });
+}
+
+} // namespace
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
index 7d93495..f8fff59 100644
--- a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
@@ -1,4 +1,5 @@
add_mlir_dialect_library(MLIROpenACCTransforms
+ ACCImplicitData.cpp
LegalizeDataValues.cpp
ADDITIONAL_HEADER_DIRS
@@ -14,7 +15,10 @@ add_mlir_dialect_library(MLIROpenACCTransforms
MLIROpenACCTypeInterfacesIncGen
LINK_LIBS PUBLIC
+ MLIRAnalysis
+ MLIROpenACCAnalysis
MLIROpenACCDialect
+ MLIROpenACCUtils
MLIRFuncDialect
MLIRIR
MLIRPass
diff --git a/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp b/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp
index 4f4620a..8859541 100644
--- a/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp
+++ b/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp
@@ -47,8 +47,6 @@ static bool happensBefore(Operation *a, Operation *b) {
// TransformState
//===----------------------------------------------------------------------===//
-constexpr const Value transform::TransformState::kTopLevelValue;
-
transform::TransformState::TransformState(
Region *region, Operation *payloadRoot,
const RaggedArray<MappedValue> &extraMappings,
diff --git a/mlir/lib/IR/BuiltinTypeInterfaces.cpp b/mlir/lib/IR/BuiltinTypeInterfaces.cpp
index ce076a3..031752b 100644
--- a/mlir/lib/IR/BuiltinTypeInterfaces.cpp
+++ b/mlir/lib/IR/BuiltinTypeInterfaces.cpp
@@ -34,8 +34,6 @@ unsigned FloatType::getFPMantissaWidth() {
// ShapedType
//===----------------------------------------------------------------------===//
-constexpr int64_t ShapedType::kDynamic;
-
int64_t ShapedType::getNumElements(ArrayRef<int64_t> shape) {
int64_t num = 1;
for (int64_t dim : shape) {
diff --git a/mlir/lib/IR/Operation.cpp b/mlir/lib/IR/Operation.cpp
index 8212d6d..bf8a918 100644
--- a/mlir/lib/IR/Operation.cpp
+++ b/mlir/lib/IR/Operation.cpp
@@ -375,9 +375,6 @@ llvm::hash_code Operation::hashProperties() {
// Operation Ordering
//===----------------------------------------------------------------------===//
-constexpr unsigned Operation::kInvalidOrderIdx;
-constexpr unsigned Operation::kOrderStride;
-
/// Given an operation 'other' that is within the same parent block, return
/// whether the current operation is before 'other' in the operation list
/// of the parent block.
diff --git a/mlir/lib/Interfaces/AlignmentAttrInterface.cpp b/mlir/lib/Interfaces/AlignmentAttrInterface.cpp
new file mode 100644
index 0000000..fe985ad
--- /dev/null
+++ b/mlir/lib/Interfaces/AlignmentAttrInterface.cpp
@@ -0,0 +1,13 @@
+//===- AlignmentAttrInterface.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Interfaces/AlignmentAttrInterface.h"
+
+using namespace mlir;
+
+#include "mlir/Interfaces/AlignmentAttrInterface.cpp.inc"
diff --git a/mlir/lib/Interfaces/CMakeLists.txt b/mlir/lib/Interfaces/CMakeLists.txt
index f96af02..ad3e2b6 100644
--- a/mlir/lib/Interfaces/CMakeLists.txt
+++ b/mlir/lib/Interfaces/CMakeLists.txt
@@ -1,4 +1,5 @@
set(LLVM_OPTIONAL_SOURCES
+ AlignmentAttrInterface.cpp
CallInterfaces.cpp
CastInterfaces.cpp
ControlFlowInterfaces.cpp
@@ -41,6 +42,7 @@ function(add_mlir_interface_library name)
endfunction(add_mlir_interface_library)
+add_mlir_interface_library(AlignmentAttrInterface)
add_mlir_interface_library(CallInterfaces)
add_mlir_interface_library(CastInterfaces)
add_mlir_interface_library(ControlFlowInterfaces)
diff --git a/mlir/lib/TableGen/CodeGenHelpers.cpp b/mlir/lib/TableGen/CodeGenHelpers.cpp
index d52d5e7..9ad031e 100644
--- a/mlir/lib/TableGen/CodeGenHelpers.cpp
+++ b/mlir/lib/TableGen/CodeGenHelpers.cpp
@@ -12,12 +12,26 @@
//===----------------------------------------------------------------------===//
#include "mlir/TableGen/CodeGenHelpers.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/TableGen/Argument.h"
+#include "mlir/TableGen/Attribute.h"
+#include "mlir/TableGen/Format.h"
#include "mlir/TableGen/Operator.h"
#include "mlir/TableGen/Pattern.h"
+#include "mlir/TableGen/Property.h"
+#include "mlir/TableGen/Region.h"
+#include "mlir/TableGen/Successor.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/CodeGenHelpers.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
+#include <cassert>
+#include <optional>
+#include <string>
using namespace llvm;
using namespace mlir;
@@ -112,6 +126,55 @@ StringRef StaticVerifierFunctionEmitter::getRegionConstraintFn(
// Constraint Emission
//===----------------------------------------------------------------------===//
+/// Helper to generate a C++ string expression from a given message.
+/// The message can contain '{{...}}' placeholders that are substituted with
+/// C++ expressions via tgfmt.
+std::string mlir::tblgen::buildErrorStreamingString(
+ StringRef message, const FmtContext &ctx, ErrorStreamType errorStreamType) {
+ std::string result;
+ raw_string_ostream os(result);
+
+ std::string msgStr = escapeString(message);
+ StringRef msg = msgStr;
+
+ // Split the message by '{{' and '}}' and build a streaming expression.
+ auto split = msg.split("{{");
+ os << split.first;
+ if (split.second.empty()) {
+ return msgStr;
+ }
+
+ if (errorStreamType == ErrorStreamType::InsideOpError)
+ os << "\")";
+ else
+ os << '"';
+
+ msg = split.second;
+ while (!msg.empty()) {
+ split = msg.split("}}");
+ StringRef var = split.first;
+ StringRef rest = split.second;
+
+ os << " << " << tgfmt(var, &ctx);
+
+ if (rest.empty())
+ break;
+
+ split = rest.split("{{");
+ if (split.second.empty() &&
+ errorStreamType == ErrorStreamType::InsideOpError) {
+ // To enable having part of string post, this adds a parenthesis before
+ // the last string segment to match the existing one.
+ os << " << (\"" << split.first;
+ } else {
+ os << " << \"" << split.first;
+ }
+ msg = split.second;
+ }
+
+ return os.str();
+}
+
/// Code templates for emitting type, attribute, successor, and region
/// constraints. Each of these templates require the following arguments:
///
@@ -224,22 +287,24 @@ static ::llvm::LogicalResult {0}(
void StaticVerifierFunctionEmitter::emitConstraints(
const ConstraintMap &constraints, StringRef selfName,
- const char *const codeTemplate) {
+ const char *const codeTemplate, ErrorStreamType errorStreamType) {
FmtContext ctx;
ctx.addSubst("_op", "*op").withSelf(selfName);
+
for (auto &it : constraints) {
os << formatv(codeTemplate, it.second,
tgfmt(it.first.getConditionTemplate(), &ctx),
- escapeString(it.first.getSummary()));
+ buildErrorStreamingString(it.first.getSummary(), ctx));
}
}
-
void StaticVerifierFunctionEmitter::emitTypeConstraints() {
- emitConstraints(typeConstraints, "type", typeConstraintCode);
+ emitConstraints(typeConstraints, "type", typeConstraintCode,
+ ErrorStreamType::InString);
}
void StaticVerifierFunctionEmitter::emitAttrConstraints() {
- emitConstraints(attrConstraints, "attr", attrConstraintCode);
+ emitConstraints(attrConstraints, "attr", attrConstraintCode,
+ ErrorStreamType::InString);
}
/// Unlike with the other helpers, this one has to substitute in the interface
@@ -251,17 +316,19 @@ void StaticVerifierFunctionEmitter::emitPropConstraints() {
auto propConstraint = cast<PropConstraint>(it.first);
os << formatv(propConstraintCode, it.second,
tgfmt(propConstraint.getConditionTemplate(), &ctx),
- escapeString(it.first.getSummary()),
+ buildErrorStreamingString(it.first.getSummary(), ctx),
propConstraint.getInterfaceType());
}
}
void StaticVerifierFunctionEmitter::emitSuccessorConstraints() {
- emitConstraints(successorConstraints, "successor", successorConstraintCode);
+ emitConstraints(successorConstraints, "successor", successorConstraintCode,
+ ErrorStreamType::InString);
}
void StaticVerifierFunctionEmitter::emitRegionConstraints() {
- emitConstraints(regionConstraints, "region", regionConstraintCode);
+ emitConstraints(regionConstraints, "region", regionConstraintCode,
+ ErrorStreamType::InString);
}
void StaticVerifierFunctionEmitter::emitPatternConstraints() {
@@ -270,13 +337,14 @@ void StaticVerifierFunctionEmitter::emitPatternConstraints() {
for (auto &it : typeConstraints) {
os << formatv(patternConstraintCode, it.second,
tgfmt(it.first.getConditionTemplate(), &ctx),
- escapeString(it.first.getSummary()), "::mlir::Type type");
+ buildErrorStreamingString(it.first.getSummary(), ctx),
+ "::mlir::Type type");
}
ctx.withSelf("attr");
for (auto &it : attrConstraints) {
os << formatv(patternConstraintCode, it.second,
tgfmt(it.first.getConditionTemplate(), &ctx),
- escapeString(it.first.getSummary()),
+ buildErrorStreamingString(it.first.getSummary(), ctx),
"::mlir::Attribute attr");
}
ctx.withSelf("prop");
@@ -291,7 +359,7 @@ void StaticVerifierFunctionEmitter::emitPatternConstraints() {
}
os << formatv(patternConstraintCode, it.second,
tgfmt(propConstraint.getConditionTemplate(), &ctx),
- escapeString(propConstraint.getSummary()),
+ buildErrorStreamingString(propConstraint.getSummary(), ctx),
Twine(interfaceType) + " prop");
}
}
diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir
index 5ff2920..d10651f 100644
--- a/mlir/test/Dialect/MemRef/invalid.mlir
+++ b/mlir/test/Dialect/MemRef/invalid.mlir
@@ -992,6 +992,22 @@ func.func @invalid_store_alignment(%memref: memref<4xi32>, %val: i32) {
// -----
+func.func @invalid_alloc_alignment() {
+ // expected-error @below {{'memref.alloc' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}}
+ %0 = memref.alloc() {alignment = 3} : memref<4xf32>
+ return
+}
+
+// -----
+
+func.func @invalid_realloc_alignment(%src: memref<4xf32>) {
+ // expected-error @below {{'memref.realloc' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}}
+ %0 = memref.realloc %src {alignment = 7} : memref<4xf32> to memref<8xf32>
+ return
+}
+
+// -----
+
func.func @test_alloc_memref_map_rank_mismatch() {
^bb0:
// expected-error@+1 {{memref layout mismatch between rank and affine map: 2 != 1}}
diff --git a/mlir/test/Dialect/OpenACC/acc-implicit-data-reduction.mlir b/mlir/test/Dialect/OpenACC/acc-implicit-data-reduction.mlir
new file mode 100644
index 0000000..cff118b
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/acc-implicit-data-reduction.mlir
@@ -0,0 +1,109 @@
+// RUN: mlir-opt %s -acc-implicit-data=enable-implicit-reduction-copy=true -split-input-file | FileCheck %s --check-prefix=COPY
+// RUN: mlir-opt %s -acc-implicit-data=enable-implicit-reduction-copy=false -split-input-file | FileCheck %s --check-prefix=FIRSTPRIVATE
+
+// Test case: scalar reduction variable in parallel loop
+// When enable-implicit-reduction-copy=true: expect copyin/copyout for reduction variable
+// When enable-implicit-reduction-copy=false: expect firstprivate for reduction variable
+
+acc.reduction.recipe @reduction_add_memref_i32 : memref<i32> reduction_operator <add> init {
+^bb0(%arg0: memref<i32>):
+ %c0_i32 = arith.constant 0 : i32
+ %alloc = memref.alloca() : memref<i32>
+ memref.store %c0_i32, %alloc[] : memref<i32>
+ acc.yield %alloc : memref<i32>
+} combiner {
+^bb0(%arg0: memref<i32>, %arg1: memref<i32>):
+ %0 = memref.load %arg0[] : memref<i32>
+ %1 = memref.load %arg1[] : memref<i32>
+ %2 = arith.addi %0, %1 : i32
+ memref.store %2, %arg0[] : memref<i32>
+ acc.yield %arg0 : memref<i32>
+}
+
+func.func @test_reduction_implicit_copy() {
+ %c1_i32 = arith.constant 1 : i32
+ %c100_i32 = arith.constant 100 : i32
+ %c0_i32 = arith.constant 0 : i32
+ %r = memref.alloca() : memref<i32>
+ memref.store %c0_i32, %r[] : memref<i32>
+
+ acc.parallel {
+ %red_var = acc.reduction varPtr(%r : memref<i32>) -> memref<i32> {name = "r"}
+ acc.loop reduction(@reduction_add_memref_i32 -> %red_var : memref<i32>) control(%iv : i32) = (%c1_i32 : i32) to (%c100_i32 : i32) step (%c1_i32 : i32) {
+ %load = memref.load %red_var[] : memref<i32>
+ %add = arith.addi %load, %c1_i32 : i32
+ memref.store %add, %red_var[] : memref<i32>
+ acc.yield
+ } attributes {independent = [#acc.device_type<none>]}
+ acc.yield
+ }
+ return
+}
+
+// When enable-implicit-reduction-copy=true: expect copyin/copyout for reduction variable
+// COPY-LABEL: func.func @test_reduction_implicit_copy
+// COPY: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : memref<i32>) -> memref<i32> {dataClause = #acc<data_clause acc_reduction>, implicit = true, name = ""}
+// COPY: acc.copyout accPtr(%[[COPYIN]] : memref<i32>) to varPtr({{.*}} : memref<i32>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = ""}
+
+// When enable-implicit-reduction-copy=false: expect firstprivate for reduction variable
+// FIRSTPRIVATE-LABEL: func.func @test_reduction_implicit_copy
+// FIRSTPRIVATE: acc.firstprivate varPtr({{.*}} : memref<i32>) -> memref<i32> {implicit = true, name = ""}
+// FIRSTPRIVATE-NOT: acc.copyin
+// FIRSTPRIVATE-NOT: acc.copyout
+
+// -----
+
+// Test case: reduction variable used both in loop and outside
+// Should be firstprivate regardless of the flag setting
+
+acc.reduction.recipe @reduction_add_memref_i32_2 : memref<i32> reduction_operator <add> init {
+^bb0(%arg0: memref<i32>):
+ %c0_i32 = arith.constant 0 : i32
+ %alloc = memref.alloca() : memref<i32>
+ memref.store %c0_i32, %alloc[] : memref<i32>
+ acc.yield %alloc : memref<i32>
+} combiner {
+^bb0(%arg0: memref<i32>, %arg1: memref<i32>):
+ %0 = memref.load %arg0[] : memref<i32>
+ %1 = memref.load %arg1[] : memref<i32>
+ %2 = arith.addi %0, %1 : i32
+ memref.store %2, %arg0[] : memref<i32>
+ acc.yield %arg0 : memref<i32>
+}
+
+func.func @test_reduction_with_usage_outside_loop() {
+ %c1_i32 = arith.constant 1 : i32
+ %c100_i32 = arith.constant 100 : i32
+ %c0_i32 = arith.constant 0 : i32
+ %r = memref.alloca() : memref<i32>
+ %out = memref.alloca() : memref<i32>
+ memref.store %c0_i32, %r[] : memref<i32>
+
+ %out_create = acc.create varPtr(%out : memref<i32>) -> memref<i32> {dataClause = #acc<data_clause acc_copyout>, name = "out"}
+ acc.parallel dataOperands(%out_create : memref<i32>) {
+ %red_var = acc.reduction varPtr(%r : memref<i32>) -> memref<i32> {name = "r"}
+ acc.loop reduction(@reduction_add_memref_i32_2 -> %red_var : memref<i32>) control(%iv : i32) = (%c1_i32 : i32) to (%c100_i32 : i32) step (%c1_i32 : i32) {
+ %load = memref.load %red_var[] : memref<i32>
+ %add = arith.addi %load, %c1_i32 : i32
+ memref.store %add, %red_var[] : memref<i32>
+ acc.yield
+ } attributes {independent = [#acc.device_type<none>]}
+ // out = r (usage of r outside the loop)
+ %final_r = memref.load %r[] : memref<i32>
+ memref.store %final_r, %out_create[] : memref<i32>
+ acc.yield
+ }
+ acc.copyout accPtr(%out_create : memref<i32>) to varPtr(%out : memref<i32>) {dataClause = #acc<data_clause acc_copyout>, name = "out"}
+ return
+}
+
+// In this case, r should be firstprivate regardless of the flag setting
+// because it's used outside the reduction context
+// COPY-LABEL: func.func @test_reduction_with_usage_outside_loop
+// COPY: acc.firstprivate varPtr({{.*}} : memref<i32>) -> memref<i32> {implicit = true, name = ""}
+// COPY-NOT: acc.copyin varPtr({{.*}} : memref<i32>) -> memref<i32> {{.*}} name = ""
+
+// FIRSTPRIVATE-LABEL: func.func @test_reduction_with_usage_outside_loop
+// FIRSTPRIVATE: acc.firstprivate varPtr({{.*}} : memref<i32>) -> memref<i32> {implicit = true, name = ""}
+// FIRSTPRIVATE-NOT: acc.copyin varPtr({{.*}} : memref<i32>) -> memref<i32> {{.*}} name = ""
+
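The flag toggled by the two RUN lines above reduces to a small predicate: a reduction variable with no uses outside the reduction context is copied in and out when the flag is on, and firstprivatized otherwise; any use outside the reduction forces firstprivate regardless of the flag. A minimal C++ sketch of that choice follows; the enum and helper names are hypothetical and only mirror the FileCheck expectations above, not the pass's real API.

    // Hypothetical sketch of the clause selection the tests above pin down.
    // Neither the enum nor the helper corresponds to the pass's actual API.
    enum class ImplicitClause { CopyInOut, Firstprivate };

    ImplicitClause clauseForReductionVar(bool enableImplicitReductionCopy,
                                         bool usedOutsideReduction) {
      // A variable also used outside the reduction is always firstprivatized,
      // matching the second test case above.
      if (usedOutsideReduction)
        return ImplicitClause::Firstprivate;
      // Otherwise the pass option selects between the two behaviors.
      return enableImplicitReductionCopy ? ImplicitClause::CopyInOut
                                         : ImplicitClause::Firstprivate;
    }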
diff --git a/mlir/test/Dialect/OpenACC/acc-implicit-data.mlir b/mlir/test/Dialect/OpenACC/acc-implicit-data.mlir
new file mode 100644
index 0000000..cf09c33
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/acc-implicit-data.mlir
@@ -0,0 +1,224 @@
+// RUN: mlir-opt %s -acc-implicit-data -split-input-file | FileCheck %s
+
+// -----
+
+// Test scalar in serial construct - should generate firstprivate
+func.func @test_scalar_in_serial() {
+ %alloc = memref.alloca() : memref<i64>
+ acc.serial {
+ %load = memref.load %alloc[] : memref<i64>
+ acc.yield
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @test_scalar_in_serial
+// CHECK: acc.firstprivate varPtr({{.*}} : memref<i64>) -> memref<i64> {implicit = true, name = ""}
+
+// -----
+
+// Test scalar in parallel construct - should generate firstprivate
+func.func @test_scalar_in_parallel() {
+ %alloc = memref.alloca() : memref<f32>
+ acc.parallel {
+ %load = memref.load %alloc[] : memref<f32>
+ acc.yield
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @test_scalar_in_parallel
+// CHECK: acc.firstprivate varPtr({{.*}} : memref<f32>) -> memref<f32> {implicit = true, name = ""}
+
+// -----
+
+// Test scalar in kernels construct - should generate copyin/copyout
+func.func @test_scalar_in_kernels() {
+ %alloc = memref.alloca() : memref<f64>
+ acc.kernels {
+ %load = memref.load %alloc[] : memref<f64>
+ acc.terminator
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @test_scalar_in_kernels
+// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : memref<f64>) -> memref<f64> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = ""}
+// CHECK: acc.copyout accPtr(%[[COPYIN]] : memref<f64>) to varPtr({{.*}} : memref<f64>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = ""}
+
+// -----
+
+// Test scalar in parallel with default(none) - should NOT generate implicit data
+func.func @test_scalar_parallel_defaultnone() {
+ %alloc = memref.alloca() : memref<f32>
+ acc.parallel {
+ %load = memref.load %alloc[] : memref<f32>
+ acc.yield
+ } attributes {defaultAttr = #acc<defaultvalue none>}
+ return
+}
+
+// CHECK-LABEL: func.func @test_scalar_parallel_defaultnone
+// CHECK-NOT: acc.firstprivate
+// CHECK-NOT: acc.copyin
+
+// -----
+
+// Test array in parallel - should generate copyin/copyout
+func.func @test_array_in_parallel() {
+ %alloc = memref.alloca() : memref<10xf32>
+ acc.parallel {
+ %c0 = arith.constant 0 : index
+ %load = memref.load %alloc[%c0] : memref<10xf32>
+ acc.yield
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @test_array_in_parallel
+// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : memref<10xf32>) -> memref<10xf32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = ""}
+// CHECK: acc.copyout accPtr(%[[COPYIN]] : memref<10xf32>) to varPtr({{.*}} : memref<10xf32>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = ""}
+
+// -----
+
+// Test array in kernels - should generate copyin/copyout
+func.func @test_array_in_kernels() {
+ %alloc = memref.alloca() : memref<20xi32>
+ acc.kernels {
+ %c0 = arith.constant 0 : index
+ %load = memref.load %alloc[%c0] : memref<20xi32>
+ acc.terminator
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @test_array_in_kernels
+// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : memref<20xi32>) -> memref<20xi32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = ""}
+// CHECK: acc.copyout accPtr(%[[COPYIN]] : memref<20xi32>) to varPtr({{.*}} : memref<20xi32>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = ""}
+
+// -----
+
+// Test array with default(present) - should generate present
+func.func @test_array_parallel_defaultpresent() {
+ %alloc = memref.alloca() : memref<10xf32>
+ acc.parallel {
+ %c0 = arith.constant 0 : index
+ %load = memref.load %alloc[%c0] : memref<10xf32>
+ acc.yield
+ } attributes {defaultAttr = #acc<defaultvalue present>}
+ return
+}
+
+// CHECK-LABEL: func.func @test_array_parallel_defaultpresent
+// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : memref<10xf32>) -> memref<10xf32> {implicit = true, name = ""}
+// CHECK: acc.delete accPtr(%[[PRESENT]] : memref<10xf32>) {dataClause = #acc<data_clause acc_present>, implicit = true, name = ""}
+
+// -----
+
+// Test scalar with default(present) - should still generate firstprivate (scalars ignore default(present))
+func.func @test_scalar_parallel_defaultpresent() {
+ %alloc = memref.alloca() : memref<f32>
+ acc.parallel {
+ %load = memref.load %alloc[] : memref<f32>
+ acc.yield
+ } attributes {defaultAttr = #acc<defaultvalue present>}
+ return
+}
+
+// CHECK-LABEL: func.func @test_scalar_parallel_defaultpresent
+// CHECK: acc.firstprivate varPtr({{.*}} : memref<f32>) -> memref<f32> {implicit = true, name = ""}
+
+// -----
+
+// Test multidimensional array
+func.func @test_multidim_array_in_parallel() {
+ %alloc = memref.alloca() : memref<8x16xf32>
+ acc.parallel {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %load = memref.load %alloc[%c0, %c1] : memref<8x16xf32>
+ acc.yield
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @test_multidim_array_in_parallel
+// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : memref<8x16xf32>) -> memref<8x16xf32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = ""}
+// CHECK: acc.copyout accPtr(%[[COPYIN]] : memref<8x16xf32>) to varPtr({{.*}} : memref<8x16xf32>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = ""}
+
+// -----
+
+// Test dynamic size array
+func.func @test_dynamic_array(%size: index) {
+ %alloc = memref.alloca(%size) : memref<?xf64>
+ acc.parallel {
+ %c0 = arith.constant 0 : index
+ %load = memref.load %alloc[%c0] : memref<?xf64>
+ acc.yield
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @test_dynamic_array
+// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : memref<?xf64>) -> memref<?xf64> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = ""}
+// CHECK: acc.copyout accPtr(%[[COPYIN]] : memref<?xf64>) to varPtr({{.*}} : memref<?xf64>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = ""}
+
+// -----
+
+// Test variable with an explicit data clause - the implicit-data pass should recognize it and only mark it present
+func.func @test_with_explicit_copyin() {
+ %alloc = memref.alloca() : memref<100xf32>
+ %copyin = acc.copyin varPtr(%alloc : memref<100xf32>) -> memref<100xf32> {name = "explicit"}
+ acc.parallel dataOperands(%copyin : memref<100xf32>) {
+ %c0 = arith.constant 0 : index
+ %load = memref.load %alloc[%c0] : memref<100xf32>
+ acc.yield
+ }
+ acc.copyout accPtr(%copyin : memref<100xf32>) to varPtr(%alloc : memref<100xf32>) {name = "explicit"}
+ return
+}
+
+// CHECK-LABEL: func.func @test_with_explicit_copyin
+// CHECK: acc.present varPtr({{.*}} : memref<100xf32>) -> memref<100xf32> {implicit = true, name = ""}
+
+// -----
+
+// Test multiple variables
+func.func @test_multiple_variables() {
+ %alloc1 = memref.alloca() : memref<f32>
+ %alloc2 = memref.alloca() : memref<10xi32>
+ acc.parallel {
+ %load1 = memref.load %alloc1[] : memref<f32>
+ %c0 = arith.constant 0 : index
+ %load2 = memref.load %alloc2[%c0] : memref<10xi32>
+ acc.yield
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @test_multiple_variables
+// CHECK: acc.firstprivate varPtr({{.*}} : memref<f32>) -> memref<f32> {implicit = true, name = ""}
+// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : memref<10xi32>) -> memref<10xi32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = ""}
+// CHECK: acc.copyout accPtr(%[[COPYIN]] : memref<10xi32>) to varPtr({{.*}} : memref<10xi32>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = ""}
+
+// -----
+
+// Test memref.view aliasing - view of explicitly copied buffer should generate present
+func.func @test_memref_view(%size: index) {
+ %c0 = arith.constant 0 : index
+ %buffer = memref.alloca(%size) : memref<?xi8>
+ %copyin = acc.copyin varPtr(%buffer : memref<?xi8>) -> memref<?xi8> {name = "buffer"}
+ %view = memref.view %buffer[%c0][] : memref<?xi8> to memref<8x64xf32>
+ acc.kernels dataOperands(%copyin : memref<?xi8>) {
+ %c0_0 = arith.constant 0 : index
+ %c0_1 = arith.constant 0 : index
+ %load = memref.load %view[%c0_0, %c0_1] : memref<8x64xf32>
+ acc.terminator
+ }
+ acc.copyout accPtr(%copyin : memref<?xi8>) to varPtr(%buffer : memref<?xi8>) {name = "buffer"}
+ return
+}
+
+// CHECK-LABEL: func.func @test_memref_view
+// CHECK: acc.present varPtr({{.*}} : memref<8x64xf32>) -> memref<8x64xf32> {implicit = true, name = ""}
+
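Taken together, the cases above spell out the implicit data-attribute rules the new pass applies: scalars become firstprivate on parallel and serial but copy on kernels; aggregates become copy; default(none) suppresses all implicit clauses; default(present) turns aggregates into present while scalars stay firstprivate; and variables already covered by an explicit clause, including through view-like aliases, are only marked present. A hedged C++ distillation of that decision table, with every name hypothetical:

    // Hypothetical decision table distilled from the FileCheck expectations
    // above; not the pass's actual API.
    enum class Construct { Parallel, Serial, Kernels };
    enum class DefaultClause { Unset, None, Present };
    enum class Clause { Skip, Firstprivate, CopyInOut, Present };

    Clause implicitClauseFor(Construct c, DefaultClause d, bool isScalar,
                             bool alreadyMapped) {
      if (alreadyMapped)            // explicit clause or aliased mapping
        return Clause::Present;
      if (d == DefaultClause::None) // default(none): nothing implicit
        return Clause::Skip;
      if (isScalar)                 // scalars ignore default(present)
        return c == Construct::Kernels ? Clause::CopyInOut
                                       : Clause::Firstprivate;
      return d == DefaultClause::Present ? Clause::Present : Clause::CopyInOut;
    }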
diff --git a/mlir/test/Dialect/OpenACC/canonicalize.mlir b/mlir/test/Dialect/OpenACC/canonicalize.mlir
index fdc8e6b..38d3df3 100644
--- a/mlir/test/Dialect/OpenACC/canonicalize.mlir
+++ b/mlir/test/Dialect/OpenACC/canonicalize.mlir
@@ -219,3 +219,30 @@ func.func @update_unnecessary_computations(%x: memref<i32>) {
// CHECK-LABEL: func.func @update_unnecessary_computations
// CHECK-NOT: acc.atomic.update
// CHECK: acc.atomic.write
+
+// -----
+
+func.func @kernel_environment_canonicalization(%q1: i32, %q2: i32, %q3: i32) {
+ // Empty kernel_environment (no wait) - should be removed
+ acc.kernel_environment {
+ }
+
+ acc.kernel_environment wait({%q1 : i32, %q2 : i32}) {
+ }
+
+ acc.kernel_environment wait {
+ }
+
+ acc.kernel_environment wait({%q3 : i32} [#acc.device_type<nvidia>]) {
+ }
+
+ return
+}
+
+// CHECK-LABEL: func.func @kernel_environment_canonicalization
+// CHECK-SAME: ([[Q1:%.*]]: i32, [[Q2:%.*]]: i32, [[Q3:%.*]]: i32)
+// CHECK-NOT: acc.kernel_environment wait({{.*}}[#acc.device_type<none>])
+// CHECK: acc.wait([[Q1]], [[Q2]] : i32, i32)
+// CHECK: acc.wait{{$}}
+// CHECK: acc.kernel_environment wait({{.*}}[#acc.device_type<nvidia>])
+// CHECK: return
diff --git a/mlir/test/mlir-tblgen/constraint-unique.td b/mlir/test/mlir-tblgen/constraint-unique.td
index d51e1a5f..3f2e5cd 100644
--- a/mlir/test/mlir-tblgen/constraint-unique.td
+++ b/mlir/test/mlir-tblgen/constraint-unique.td
@@ -16,7 +16,7 @@ def AType : Type<ATypePred, "a type">;
def OtherType : Type<ATypePred, "another type">;
def AnAttrPred : CPred<"attrPred($_self, $_op)">;
-def AnAttr : Attr<AnAttrPred, "an attribute">;
+def AnAttr : Attr<AnAttrPred, "an attribute (got {{reformat($_self)}})">;
def OtherAttr : Attr<AnAttrPred, "another attribute">;
def ASuccessorPred : CPred<"successorPred($_self, $_op)">;
@@ -24,7 +24,7 @@ def ASuccessor : Successor<ASuccessorPred, "a successor">;
def OtherSuccessor : Successor<ASuccessorPred, "another successor">;
def ARegionPred : CPred<"regionPred($_self, $_op)">;
-def ARegion : Region<ARegionPred, "a region">;
+def ARegion : Region<ARegionPred, "a region ({{find(foo)}})">;
def OtherRegion : Region<ARegionPred, "another region">;
// OpA and OpB have the same type, attribute, successor, and region constraints.
@@ -71,10 +71,10 @@ def OpC : NS_Op<"op_c"> {
// CHECK: static ::llvm::LogicalResult [[$A_ATTR_CONSTRAINT:__mlir_ods_local_attr_constraint.*]](
// CHECK: if (attr && !((attrPred(attr, *op))))
// CHECK-NEXT: return emitError() << "attribute '" << attrName
-// CHECK-NEXT: << "' failed to satisfy constraint: an attribute";
+// CHECK-NEXT: << "' failed to satisfy constraint: an attribute (got " << reformat(attr) << ")";
/// Test that duplicate attribute constraint was not generated.
-// CHECK-NOT: << "' failed to satisfy constraint: an attribute";
+// CHECK-NOT: << "' failed to satisfy constraint: an attribute
/// Test that an attribute constraint with a different description was generated.
// CHECK: static ::llvm::LogicalResult [[$O_ATTR_CONSTRAINT:__mlir_ods_local_attr_constraint.*]](
@@ -103,7 +103,7 @@ def OpC : NS_Op<"op_c"> {
// CHECK: if (!((regionPred(region, *op)))) {
// CHECK-NEXT: return op->emitOpError("region #") << regionIndex
// CHECK-NEXT: << (regionName.empty() ? " " : " ('" + regionName + "') ")
-// CHECK-NEXT: << "failed to verify constraint: a region";
+// CHECK-NEXT: << "failed to verify constraint: a region (" << find(foo) << ")";
/// Test that duplicate region constraint was not generated.
// CHECK-NOT: << "failed to verify constraint: a region";
diff --git a/mlir/test/mlir-tblgen/op-attribute.td b/mlir/test/mlir-tblgen/op-attribute.td
index 549830e..a3cb9a4 100644
--- a/mlir/test/mlir-tblgen/op-attribute.td
+++ b/mlir/test/mlir-tblgen/op-attribute.td
@@ -69,19 +69,19 @@ def AOp : NS_Op<"a_op", []> {
// DEF: ::llvm::LogicalResult AOpAdaptor::verify
// DEF-NEXT: auto tblgen_aAttr = getProperties().aAttr; (void)tblgen_aAttr;
-// DEF-NEXT: if (!tblgen_aAttr) return emitError(loc, "'test.a_op' op ""requires attribute 'aAttr'");
+// DEF-NEXT: if (!tblgen_aAttr) return emitError(loc, "'test.a_op' op requires attribute 'aAttr'");
// DEF-NEXT: auto tblgen_bAttr = getProperties().bAttr; (void)tblgen_bAttr;
// DEF-NEXT: auto tblgen_cAttr = getProperties().cAttr; (void)tblgen_cAttr;
// DEF-NEXT: auto tblgen_dAttr = getProperties().dAttr; (void)tblgen_dAttr;
// DEF: if (tblgen_aAttr && !((some-condition)))
-// DEF-NEXT: return emitError(loc, "'test.a_op' op ""attribute 'aAttr' failed to satisfy constraint: some attribute kind");
+// DEF-NEXT: return emitError(loc, "'test.a_op' op attribute 'aAttr' failed to satisfy constraint: some attribute kind");
// DEF: if (tblgen_bAttr && !((some-condition)))
-// DEF-NEXT: return emitError(loc, "'test.a_op' op ""attribute 'bAttr' failed to satisfy constraint: some attribute kind");
+// DEF-NEXT: return emitError(loc, "'test.a_op' op attribute 'bAttr' failed to satisfy constraint: some attribute kind");
// DEF: if (tblgen_cAttr && !((some-condition)))
-// DEF-NEXT: return emitError(loc, "'test.a_op' op ""attribute 'cAttr' failed to satisfy constraint: some attribute kind");
+// DEF-NEXT: return emitError(loc, "'test.a_op' op attribute 'cAttr' failed to satisfy constraint: some attribute kind");
// DEF: if (tblgen_dAttr && !((some-condition)))
-// DEF-NEXT: return emitError(loc, "'test.a_op' op ""attribute 'dAttr' failed to satisfy constraint: some attribute kind");
+// DEF-NEXT: return emitError(loc, "'test.a_op' op attribute 'dAttr' failed to satisfy constraint: some attribute kind");
// Test getter methods
// ---
@@ -219,13 +219,13 @@ def AgetOp : Op<Test2_Dialect, "a_get_op", []> {
// DEF: ::llvm::LogicalResult AgetOpAdaptor::verify
// DEF: auto tblgen_aAttr = getProperties().aAttr; (void)tblgen_aAttr;
-// DEF: if (!tblgen_aAttr) return emitError(loc, "'test2.a_get_op' op ""requires attribute 'aAttr'");
+// DEF: if (!tblgen_aAttr) return emitError(loc, "'test2.a_get_op' op requires attribute 'aAttr'");
// DEF: auto tblgen_bAttr = getProperties().bAttr; (void)tblgen_bAttr;
// DEF: auto tblgen_cAttr = getProperties().cAttr; (void)tblgen_cAttr;
// DEF: if (tblgen_bAttr && !((some-condition)))
-// DEF-NEXT: return emitError(loc, "'test2.a_get_op' op ""attribute 'bAttr' failed to satisfy constraint: some attribute kind");
+// DEF-NEXT: return emitError(loc, "'test2.a_get_op' op attribute 'bAttr' failed to satisfy constraint: some attribute kind");
// DEF: if (tblgen_cAttr && !((some-condition)))
-// DEF-NEXT: return emitError(loc, "'test2.a_get_op' op ""attribute 'cAttr' failed to satisfy constraint: some attribute kind");
+// DEF-NEXT: return emitError(loc, "'test2.a_get_op' op attribute 'cAttr' failed to satisfy constraint: some attribute kind");
// Test getter methods
// ---
diff --git a/mlir/test/mlir-tblgen/op-properties-predicates.td b/mlir/test/mlir-tblgen/op-properties-predicates.td
index af09ee7..7cc9633 100644
--- a/mlir/test/mlir-tblgen/op-properties-predicates.td
+++ b/mlir/test/mlir-tblgen/op-properties-predicates.td
@@ -74,7 +74,7 @@ def OpWithPredicates : NS_Op<"op_with_predicates"> {
// Note: comprehensive emission of verifiers is tested in verifyInvariantsImpl() below
// CHECK: int64_t tblgen_scalar = this->getScalar();
// CHECK: if (!((tblgen_scalar >= 0)))
-// CHECK: return emitError(loc, "'test.op_with_predicates' op ""property 'scalar' failed to satisfy constraint: non-negative int64_t");
+// CHECK: return emitError(loc, "'test.op_with_predicates' op property 'scalar' failed to satisfy constraint: non-negative int64_t");
// CHECK-LABEL: OpWithPredicates::verifyInvariantsImpl()
// Note: for test readability, we capture [[maybe_unused]] into the variable maybe_unused
diff --git a/mlir/test/mlir-tblgen/predicate.td b/mlir/test/mlir-tblgen/predicate.td
index c1fcd3f..41e041f 100644
--- a/mlir/test/mlir-tblgen/predicate.td
+++ b/mlir/test/mlir-tblgen/predicate.td
@@ -55,7 +55,7 @@ def OpF : NS_Op<"op_for_int_min_val", []> {
// CHECK-LABEL: OpFAdaptor::verify
// CHECK: (::llvm::cast<::mlir::IntegerAttr>(tblgen_attr).getInt() >= 10)
-// CHECK-NEXT: "attribute 'attr' failed to satisfy constraint: 32-bit signless integer attribute whose minimum value is 10"
+// CHECK-NEXT: attribute 'attr' failed to satisfy constraint: 32-bit signless integer attribute whose minimum value is 10"
def OpFX : NS_Op<"op_for_int_max_val", []> {
let arguments = (ins ConfinedAttr<I32Attr, [IntMaxValue<10>]>:$attr);
@@ -63,7 +63,7 @@ def OpFX : NS_Op<"op_for_int_max_val", []> {
// CHECK-LABEL: OpFXAdaptor::verify
// CHECK: (::llvm::cast<::mlir::IntegerAttr>(tblgen_attr).getInt() <= 10)
-// CHECK-NEXT: "attribute 'attr' failed to satisfy constraint: 32-bit signless integer attribute whose maximum value is 10"
+// CHECK-NEXT: attribute 'attr' failed to satisfy constraint: 32-bit signless integer attribute whose maximum value is 10"
def OpG : NS_Op<"op_for_arr_min_count", []> {
let arguments = (ins ConfinedAttr<ArrayAttr, [ArrayMinCount<8>]>:$attr);
@@ -71,7 +71,7 @@ def OpG : NS_Op<"op_for_arr_min_count", []> {
// CHECK-LABEL: OpGAdaptor::verify
// CHECK: (::llvm::cast<::mlir::ArrayAttr>(tblgen_attr).size() >= 8)
-// CHECK-NEXT: "attribute 'attr' failed to satisfy constraint: array attribute with at least 8 elements"
+// CHECK-NEXT: attribute 'attr' failed to satisfy constraint: array attribute with at least 8 elements"
def OpH : NS_Op<"op_for_arr_value_at_index", []> {
let arguments = (ins ConfinedAttr<ArrayAttr, [IntArrayNthElemEq<0, 8>]>:$attr);
@@ -79,7 +79,7 @@ def OpH : NS_Op<"op_for_arr_value_at_index", []> {
// CHECK-LABEL: OpHAdaptor::verify
// CHECK: (((::llvm::cast<::mlir::ArrayAttr>(tblgen_attr).size() > 0)) && ((::llvm::cast<::mlir::IntegerAttr>(::llvm::cast<::mlir::ArrayAttr>(tblgen_attr)[0]).getInt() == 8)))))
-// CHECK-NEXT: "attribute 'attr' failed to satisfy constraint: array attribute whose 0-th element must be 8"
+// CHECK-NEXT: attribute 'attr' failed to satisfy constraint: array attribute whose 0-th element must be 8"
def OpI: NS_Op<"op_for_arr_min_value_at_index", []> {
let arguments = (ins ConfinedAttr<ArrayAttr, [IntArrayNthElemMinValue<0, 8>]>:$attr);
@@ -87,7 +87,7 @@ def OpI: NS_Op<"op_for_arr_min_value_at_index", []> {
// CHECK-LABEL: OpIAdaptor::verify
// CHECK: (((::llvm::cast<::mlir::ArrayAttr>(tblgen_attr).size() > 0)) && ((::llvm::cast<::mlir::IntegerAttr>(::llvm::cast<::mlir::ArrayAttr>(tblgen_attr)[0]).getInt() >= 8)))))
-// CHECK-NEXT: "attribute 'attr' failed to satisfy constraint: array attribute whose 0-th element must be at least 8"
+// CHECK-NEXT: attribute 'attr' failed to satisfy constraint: array attribute whose 0-th element must be at least 8"
def OpJ: NS_Op<"op_for_arr_max_value_at_index", []> {
let arguments = (ins ConfinedAttr<ArrayAttr, [IntArrayNthElemMaxValue<0, 8>]>:$attr);
@@ -95,7 +95,7 @@ def OpJ: NS_Op<"op_for_arr_max_value_at_index", []> {
// CHECK-LABEL: OpJAdaptor::verify
// CHECK: (((::llvm::cast<::mlir::ArrayAttr>(tblgen_attr).size() > 0)) && ((::llvm::cast<::mlir::IntegerAttr>(::llvm::cast<::mlir::ArrayAttr>(tblgen_attr)[0]).getInt() <= 8)))))
-// CHECK-NEXT: "attribute 'attr' failed to satisfy constraint: array attribute whose 0-th element must be at most 8"
+// CHECK-NEXT: attribute 'attr' failed to satisfy constraint: array attribute whose 0-th element must be at most 8"
def OpK: NS_Op<"op_for_arr_in_range_at_index", []> {
let arguments = (ins ConfinedAttr<ArrayAttr, [IntArrayNthElemInRange<0, 4, 8>]>:$attr);
@@ -103,7 +103,7 @@ def OpK: NS_Op<"op_for_arr_in_range_at_index", []> {
// CHECK-LABEL: OpKAdaptor::verify
// CHECK: (((::llvm::cast<::mlir::ArrayAttr>(tblgen_attr).size() > 0)) && ((::llvm::cast<::mlir::IntegerAttr>(::llvm::cast<::mlir::ArrayAttr>(tblgen_attr)[0]).getInt() >= 4)) && ((::llvm::cast<::mlir::IntegerAttr>(::llvm::cast<::mlir::ArrayAttr>(tblgen_attr)[0]).getInt() <= 8)))))
-// CHECK-NEXT: "attribute 'attr' failed to satisfy constraint: array attribute whose 0-th element must be at least 4 and at most 8"
+// CHECK-NEXT: attribute 'attr' failed to satisfy constraint: array attribute whose 0-th element must be at least 4 and at most 8"
def OpL: NS_Op<"op_for_TCopVTEtAreSameAt", [
PredOpTrait<"operands indexed at 0, 2, 3 should all have "
@@ -121,7 +121,7 @@ def OpL: NS_Op<"op_for_TCopVTEtAreSameAt", [
// CHECK: ::llvm::all_equal(::llvm::map_range(
// CHECK-SAME: ::mlir::ArrayRef<unsigned>({0, 2, 3}),
// CHECK-SAME: [this](unsigned i) { return getElementTypeOrSelf(this->getOperand(i)); }))
-// CHECK: "failed to verify that operands indexed at 0, 2, 3 should all have the same type"
+// CHECK: failed to verify that operands indexed at 0, 2, 3 should all have the same type"
def OpM : NS_Op<"op_for_AnyTensorOf", []> {
let arguments = (ins TensorOf<[F32, I32]>:$x);
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 4d9b1b2..3b10842 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -17,6 +17,7 @@
#include "OpGenHelpers.h"
#include "mlir/TableGen/Argument.h"
#include "mlir/TableGen/Attribute.h"
+#include "mlir/TableGen/Builder.h"
#include "mlir/TableGen/Class.h"
#include "mlir/TableGen/CodeGenHelpers.h"
#include "mlir/TableGen/Format.h"
@@ -24,16 +25,24 @@
#include "mlir/TableGen/Interfaces.h"
#include "mlir/TableGen/Operator.h"
#include "mlir/TableGen/Property.h"
+#include "mlir/TableGen/Region.h"
#include "mlir/TableGen/SideEffects.h"
+#include "mlir/TableGen/Successor.h"
#include "mlir/TableGen/Trait.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/CodeGenHelpers.h"
@@ -380,9 +389,8 @@ public:
Formatter emitErrorPrefix() const {
return [this](raw_ostream &os) -> raw_ostream & {
if (emitForOp)
- return os << "emitOpError(";
- return os << formatv("emitError(loc, \"'{0}' op \"",
- op.getOperationName());
+ return os << "emitOpError(\"";
+ return os << formatv("emitError(loc, \"'{0}' op ", op.getOperationName());
};
}
@@ -940,7 +948,7 @@ genAttributeVerifier(const OpOrAdaptorHelper &emitHelper, FmtContext &ctx,
// {4}: Attribute/constraint description.
const char *const verifyAttrInline = R"(
if ({0} && !({1}))
- return {2}"attribute '{3}' failed to satisfy constraint: {4}");
+ return {2}attribute '{3}' failed to satisfy constraint: {4}");
)";
// Verify the attribute using a uniqued constraint. Can only be used within
// the context of an op.
@@ -993,10 +1001,11 @@ while (true) {{
(constraintFn = staticVerifierEmitter.getAttrConstraintFn(attr))) {
body << formatv(verifyAttrUnique, *constraintFn, varName, attrName);
} else {
- body << formatv(verifyAttrInline, varName,
- tgfmt(condition, &ctx.withSelf(varName)),
- emitHelper.emitErrorPrefix(), attrName,
- escapeString(attr.getSummary()));
+ body << formatv(
+ verifyAttrInline, varName, tgfmt(condition, &ctx.withSelf(varName)),
+ emitHelper.emitErrorPrefix(), attrName,
+ buildErrorStreamingString(attr.getSummary(), ctx.withSelf(varName),
+ ErrorStreamType::InsideOpError));
}
};
@@ -1017,7 +1026,7 @@ while (true) {{
it.first);
if (metadata.isRequired)
body << formatv(
- "if (!tblgen_{0}) return {1}\"requires attribute '{0}'\");\n",
+ "if (!tblgen_{0}) return {1}requires attribute '{0}'\");\n",
it.first, emitHelper.emitErrorPrefix());
}
} else {
@@ -1099,7 +1108,7 @@ static void genPropertyVerifier(
// {3}: Property description.
const char *const verifyPropertyInline = R"(
if (!({0}))
- return {1}"property '{2}' failed to satisfy constraint: {3}");
+ return {1}property '{2}' failed to satisfy constraint: {3}");
)";
// Verify the property using a uniqued constraint. Can only be used
@@ -1143,9 +1152,12 @@ static void genPropertyVerifier(
if (uniquedFn.has_value() && emitHelper.isEmittingForOp())
body << formatv(verifyPropertyUniqued, *uniquedFn, varName, prop.name);
else
- body << formatv(
- verifyPropertyInline, tgfmt(rawCondition, &ctx.withSelf(varName)),
- emitHelper.emitErrorPrefix(), prop.name, prop.prop.getSummary());
+ body << formatv(verifyPropertyInline,
+ tgfmt(rawCondition, &ctx.withSelf(varName)),
+ emitHelper.emitErrorPrefix(), prop.name,
+ buildErrorStreamingString(
+ prop.prop.getSummary(), ctx.withSelf(varName),
+ ErrorStreamType::InsideOpError));
}
}
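The emitErrorPrefix() change above is easy to misread: the old prefix ended with a quote and relied on C++ adjacent-string-literal concatenation, so the whole diagnostic had to be one literal; moving the opening quote into the prefix lets buildErrorStreamingString splice streamed expressions such as << reformat(attr) << into the middle of the message, which is exactly what the updated constraint-unique.td checks expect. A standalone C++ reminder of the two forms, illustrative only:

    #include <iostream>

    int main() {
      // Adjacent string literals concatenate at compile time, so the old
      // generated form "'test.a_op' op ""requires attribute 'aAttr'" was a
      // single literal and could not embed runtime values.
      std::cout << "'test.a_op' op " "requires attribute 'aAttr'" << '\n';

      // Keeping the stream open mid-message lets generated verifiers
      // interleave dynamic values, as the new CHECK lines demand.
      int attr = 7; // stand-in for a streamed attribute value
      std::cout << "'test.a_op' op attribute 'aAttr' failed: got " << attr
                << '\n';
    }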
diff --git a/mlir/unittests/TableGen/CMakeLists.txt b/mlir/unittests/TableGen/CMakeLists.txt
index c51bda6..4d8e508 100644
--- a/mlir/unittests/TableGen/CMakeLists.txt
+++ b/mlir/unittests/TableGen/CMakeLists.txt
@@ -25,6 +25,6 @@ target_include_directories(MLIRTableGenTests
)
target_link_libraries(MLIRTableGenTests
- PRIVATE MLIRTableGen MLIRIR
+ PRIVATE LLVMTableGen MLIRTableGen MLIRIR
PUBLIC MLIRTestDialect
)
diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 2a283bd..79e45fd 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -21,7 +21,6 @@ enum class DeviceDebugKind : uint32_t {
Assertion = 1U << 0,
FunctionTracing = 1U << 1,
CommonIssues = 1U << 2,
- AllocationTracker = 1U << 3,
PGODump = 1U << 4,
};
@@ -36,27 +35,6 @@ struct DeviceEnvironmentTy {
uint64_t HardwareParallelism;
};
-struct DeviceMemoryPoolTy {
- void *Ptr;
- uint64_t Size;
-};
-
-struct DeviceMemoryPoolTrackingTy {
- uint64_t NumAllocations;
- uint64_t AllocationTotal;
- uint64_t AllocationMin;
- uint64_t AllocationMax;
-
- void combine(DeviceMemoryPoolTrackingTy &Other) {
- NumAllocations += Other.NumAllocations;
- AllocationTotal += Other.AllocationTotal;
- AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin
- : AllocationMin;
- AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax
- : AllocationMax;
- }
-};
-
// NOTE: Please don't change the order of those members as their indices are
// used in the middle end. Always add the new data member at the end.
// Different from KernelEnvironmentTy below, this structure contains members
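Deleting AllocationTracker leaves 1U << 3 unassigned while PGODump keeps 1U << 4, so existing debug-mask values remain valid after this patch. A self-contained illustration of testing the mask; hasDebugKind is a hypothetical helper, the runtime's equivalent being config::isDebugMode:

    #include <cstdint>

    // Mirror of the enum after this patch: 1U << 3 (the removed
    // AllocationTracker bit) is deliberately left unused so the remaining
    // bit values stay stable.
    enum class DeviceDebugKind : uint32_t {
      Assertion = 1U << 0,
      FunctionTracing = 1U << 1,
      CommonIssues = 1U << 2,
      PGODump = 1U << 4,
    };

    // Hypothetical helper for checking a debug mask.
    inline bool hasDebugKind(uint32_t Mask, DeviceDebugKind K) {
      return Mask & static_cast<uint32_t>(K);
    }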
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 928c6cd..04b3944 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3109,17 +3109,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
StackSize = Value;
return Plugin::success();
}
- Error getDeviceHeapSize(uint64_t &Value) override {
- Value = DeviceMemoryPoolSize;
- return Plugin::success();
- }
- Error setDeviceHeapSize(uint64_t Value) override {
- for (DeviceImageTy *Image : LoadedImages)
- if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
- return Err;
- DeviceMemoryPoolSize = Value;
- return Plugin::success();
- }
Error getDeviceMemorySize(uint64_t &Value) override {
for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
if (Pool->isGlobal()) {
@@ -3321,9 +3310,6 @@ private:
/// Reference to the host device.
AMDHostDeviceTy &HostDevice;
- /// The current size of the global device memory pool (managed by us).
- uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/;
-
/// The current size of the stack that will be used in cases where it could
/// not be statically determined.
uint64_t StackSize = 16 * 1024 /* 16 KB */;
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index f9dcdea..2135e06 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -819,10 +819,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
Error unloadBinary(DeviceImageTy *Image);
virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;
- /// Setup the global device memory pool, if the plugin requires one.
- Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
- uint64_t PoolSize);
-
// Setup the RPC server for this device if needed. This may not run on some
// plugins like the CPU targets. By default, it will not be executed so it is
// up to the target to override this using the shouldSetupRPCServer function.
@@ -1067,6 +1063,16 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
virtual Error getDeviceStackSize(uint64_t &V) = 0;
+ virtual bool hasDeviceHeapSize() { return false; }
+ virtual Error getDeviceHeapSize(uint64_t &V) {
+ return Plugin::error(error::ErrorCode::UNSUPPORTED,
+ "%s not supported by platform", __func__);
+ }
+ virtual Error setDeviceHeapSize(uint64_t V) {
+ return Plugin::error(error::ErrorCode::UNSUPPORTED,
+ "%s not supported by platform", __func__);
+ }
+
/// Returns true if current plugin architecture is an APU
/// and unified_shared_memory was not requested by the program.
bool useAutoZeroCopy();
@@ -1159,12 +1165,6 @@ private:
/// plugin can implement the setters as no-op and setting the output
/// value to zero for the getters.
virtual Error setDeviceStackSize(uint64_t V) = 0;
- virtual Error getDeviceHeapSize(uint64_t &V) = 0;
- virtual Error setDeviceHeapSize(uint64_t V) = 0;
-
- /// Indicate whether the device should setup the global device memory pool. If
- /// false is return the value on the device will be uninitialized.
- virtual bool shouldSetupDeviceMemoryPool() const { return true; }
/// Indicate whether or not the device should setup the RPC server. This is
/// only necessary for unhosted targets like the GPU.
@@ -1251,10 +1251,6 @@ protected:
/// Internal representation for OMPT device (initialize & finalize)
std::atomic<bool> OmptInitialized;
#endif
-
-private:
- DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
- DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
};
/// Class implementing common functionalities of offload plugins. Each plugin
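With the heap-size hooks now opt-in rather than pure-virtual, only plugins that really have a resizable device heap override them; the CUDA hunk below does exactly that, while the host and AMDGPU plugins simply drop their stubs. A self-contained mock of the new shape, where Error and GenericDeviceTy are simplified stand-ins for the real types in this header:

    #include <cstdint>
    #include <cstdio>

    // Stand-ins for the real plugin types; the genuine Error carries an
    // error code and message rather than a bare flag.
    struct Error { bool Failed = false; };

    struct GenericDeviceTy {
      virtual ~GenericDeviceTy() = default;
      virtual bool hasDeviceHeapSize() { return false; }
      virtual Error getDeviceHeapSize(uint64_t &) { return {true}; } // unsupported
      virtual Error setDeviceHeapSize(uint64_t) { return {true}; }   // unsupported
    };

    // A device with a resizable heap overrides all three hooks, as the
    // CUDA plugin does further down in this diff.
    struct MyDeviceTy : GenericDeviceTy {
      uint64_t Heap = 1 << 20;
      bool hasDeviceHeapSize() override { return true; }
      Error getDeviceHeapSize(uint64_t &V) override { V = Heap; return {}; }
      Error setDeviceHeapSize(uint64_t V) override { Heap = V; return {}; }
    };

    int main() {
      MyDeviceTy D;
      uint64_t V = 0;
      // init() only consults LIBOMPTARGET_HEAP_SIZE when this returns true.
      if (D.hasDeviceHeapSize() && !D.getDeviceHeapSize(V).Failed)
        std::printf("heap = %llu\n", (unsigned long long)V);
    }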
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index d7e5a21..ee2ecbc 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -762,13 +762,15 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
return StackSizeEnvarOrErr.takeError();
OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr);
- auto HeapSizeEnvarOrErr = UInt64Envar::create(
- "LIBOMPTARGET_HEAP_SIZE",
- [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
- [this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
- if (!HeapSizeEnvarOrErr)
- return HeapSizeEnvarOrErr.takeError();
- OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);
+ if (hasDeviceHeapSize()) {
+ auto HeapSizeEnvarOrErr = UInt64Envar::create(
+ "LIBOMPTARGET_HEAP_SIZE",
+ [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
+ [this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
+ if (!HeapSizeEnvarOrErr)
+ return HeapSizeEnvarOrErr.takeError();
+ OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);
+ }
// Update the maximum number of teams and threads after the device
// initialization sets the corresponding hardware limit.
@@ -795,19 +797,6 @@ Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) {
if (auto Err = callGlobalDestructors(Plugin, *Image))
return Err;
- if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
- GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
- DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
- GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
- sizeof(DeviceMemoryPoolTrackingTy),
- &ImageDeviceMemoryPoolTracking);
- if (auto Err =
- GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
- consumeError(std::move(Err));
- }
- DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
- }
-
GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
if (!ProfOrErr)
@@ -833,22 +822,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
return Err;
LoadedImages.clear();
- if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
- // TODO: Write this by default into a file.
- printf("\n\n|-----------------------\n"
- "| Device memory tracker:\n"
- "|-----------------------\n"
- "| #Allocations: %lu\n"
- "| Byes allocated: %lu\n"
- "| Minimal allocation: %lu\n"
- "| Maximal allocation: %lu\n"
- "|-----------------------\n\n\n",
- DeviceMemoryPoolTracking.NumAllocations,
- DeviceMemoryPoolTracking.AllocationTotal,
- DeviceMemoryPoolTracking.AllocationMin,
- DeviceMemoryPoolTracking.AllocationMax);
- }
-
// Delete the memory manager before deinitializing the device. Otherwise,
// we may delete device allocations after the device is deinitialized.
if (MemoryManager)
@@ -901,18 +874,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
// Add the image to list.
LoadedImages.push_back(Image);
- // Setup the global device memory pool if needed.
- if (!Plugin.getRecordReplay().isReplaying() &&
- shouldSetupDeviceMemoryPool()) {
- uint64_t HeapSize;
- auto SizeOrErr = getDeviceHeapSize(HeapSize);
- if (SizeOrErr) {
- REPORT("No global device memory pool due to error: %s\n",
- toString(std::move(SizeOrErr)).data());
- } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
- return std::move(Err);
- }
-
if (auto Err = setupRPCServer(Plugin, *Image))
return std::move(Err);
@@ -936,51 +897,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
return Image;
}
-Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
- DeviceImageTy &Image,
- uint64_t PoolSize) {
- // Free the old pool, if any.
- if (DeviceMemoryPool.Ptr) {
- if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
- TargetAllocTy::TARGET_ALLOC_DEVICE))
- return Err;
- }
-
- DeviceMemoryPool.Size = PoolSize;
- auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
- TargetAllocTy::TARGET_ALLOC_DEVICE);
- if (AllocOrErr) {
- DeviceMemoryPool.Ptr = *AllocOrErr;
- } else {
- auto Err = AllocOrErr.takeError();
- REPORT("Failure to allocate device memory for global memory pool: %s\n",
- toString(std::move(Err)).data());
- DeviceMemoryPool.Ptr = nullptr;
- DeviceMemoryPool.Size = 0;
- }
-
- // Create the metainfo of the device environment global.
- GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
- if (!GHandler.isSymbolInImage(*this, Image,
- "__omp_rtl_device_memory_pool_tracker")) {
- DP("Skip the memory pool as there is no tracker symbol in the image.");
- return Error::success();
- }
-
- GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
- sizeof(DeviceMemoryPoolTrackingTy),
- &DeviceMemoryPoolTracking);
- if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
- return Err;
-
- // Create the metainfo of the device environment global.
- GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
- sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
-
- // Write device environment values to the device.
- return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
-}
-
Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
DeviceImageTy &Image) {
// The plugin either does not need an RPC server or it is unavailable.
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index a9adcc3..45e580e7 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1235,11 +1235,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Info;
}
- virtual bool shouldSetupDeviceMemoryPool() const override {
- /// We use the CUDA malloc for now.
- return false;
- }
-
/// Getters and setters for stack and heap sizes.
Error getDeviceStackSize(uint64_t &Value) override {
return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
@@ -1247,6 +1242,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
Error setDeviceStackSize(uint64_t Value) override {
return setCtxLimit(CU_LIMIT_STACK_SIZE, Value);
}
+ bool hasDeviceHeapSize() override { return true; }
Error getDeviceHeapSize(uint64_t &Value) override {
return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
}
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index eb4ecac..48de1fef 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -380,9 +380,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
return Info;
}
- /// This plugin should not setup the device environment or memory pool.
- virtual bool shouldSetupDeviceMemoryPool() const override { return false; };
-
/// Getters and setters for stack size and heap size are not relevant.
Error getDeviceStackSize(uint64_t &Value) override {
Value = 0;
@@ -391,11 +388,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
Error setDeviceStackSize(uint64_t Value) override {
return Plugin::success();
}
- Error getDeviceHeapSize(uint64_t &Value) override {
- Value = 0;
- return Plugin::success();
- }
- Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
private:
/// Grid values for Generic ELF64 plugins.
diff --git a/offload/test/offloading/malloc_parallel.c b/offload/test/libc/malloc_parallel.c
index 076a7ba..076a7ba 100644
--- a/offload/test/offloading/malloc_parallel.c
+++ b/offload/test/libc/malloc_parallel.c
diff --git a/offload/test/mapping/lambda_mapping.cpp b/offload/test/mapping/lambda_mapping.cpp
index 63b1719..8e640b7 100644
--- a/offload/test/mapping/lambda_mapping.cpp
+++ b/offload/test/mapping/lambda_mapping.cpp
@@ -4,6 +4,8 @@
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
// RUN: %libomptarget-compileoptxx-run-and-check-generic
+// REQUIRES: libc
+
#include <iostream>
template <typename LOOP_BODY>
diff --git a/offload/test/offloading/interop-print.c b/offload/test/offloading/interop-print.c
index a386420..f7b37d9 100644
--- a/offload/test/offloading/interop-print.c
+++ b/offload/test/offloading/interop-print.c
@@ -8,6 +8,7 @@
// REQUIRES: gpu
// XFAIL: nvptx64-nvidia-cuda
+// XFAIL: nvptx64-nvidia-cuda-LTO
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/offloading/malloc.c b/offload/test/offloading/malloc.c
index 7b98e1f..04e72561 100644
--- a/offload/test/offloading/malloc.c
+++ b/offload/test/offloading/malloc.c
@@ -10,7 +10,7 @@ int main() {
int Threads = 64;
int Teams = 10;
- // Allocate ~55MB on the device.
+ // Allocate ~160 KiB on the device.
#pragma omp target map(from : DP)
DP = (long unsigned *)malloc(sizeof(long unsigned) * N * Threads * Teams);
diff --git a/openmp/device/include/Allocator.h b/openmp/device/include/Allocator.h
index dc4d029..507ec63 100644
--- a/openmp/device/include/Allocator.h
+++ b/openmp/device/include/Allocator.h
@@ -14,18 +14,12 @@
#include "DeviceTypes.h"
-// Forward declaration.
-struct KernelEnvironmentTy;
-
namespace ompx {
namespace allocator {
static uint64_t constexpr ALIGNMENT = 16;
-/// Initialize the allocator according to \p KernelEnvironment
-void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
-
/// Allocate \p Size bytes.
[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
alloc(uint64_t Size);
diff --git a/openmp/device/src/Allocator.cpp b/openmp/device/src/Allocator.cpp
index aac2a60..34c945c 100644
--- a/openmp/device/src/Allocator.cpp
+++ b/openmp/device/src/Allocator.cpp
@@ -18,42 +18,36 @@
#include "Synchronization.h"
using namespace ompx;
+using namespace allocator;
+
+// Provide a default implementation of malloc / free for AMDGPU platforms built
+// without 'libc' support.
+extern "C" {
+#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
+[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
+#else
+[[gnu::leaf]] void *malloc(size_t Size);
+[[gnu::leaf]] void free(void *Ptr);
+#endif
+}
-[[gnu::used, gnu::retain, gnu::weak,
- gnu::visibility(
- "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
-[[gnu::used, gnu::retain, gnu::weak,
- gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
- __omp_rtl_device_memory_pool_tracker;
+static constexpr uint64_t MEMORY_SIZE = /* 1 MiB */ 1024 * 1024;
+alignas(ALIGNMENT) static uint8_t Memory[MEMORY_SIZE] = {0};
-/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
-/// directly.
+// Fallback bump pointer interface for platforms without a functioning
+// allocator.
struct BumpAllocatorTy final {
+ uint64_t Offset = 0;
void *alloc(uint64_t Size) {
Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
- if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
- atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
- atomic::seq_cst);
- atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
- atomic::seq_cst);
- atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
- atomic::seq_cst);
- atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
- atomic::seq_cst);
- }
-
- uint64_t *Data =
- reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
- uint64_t End =
- reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
-
- uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
- if (OldData + Size > End)
+ uint64_t OldData = atomic::add(&Offset, Size, atomic::seq_cst);
+ if (OldData + Size >= MEMORY_SIZE)
__builtin_trap();
- return reinterpret_cast<void *>(OldData);
+ return &Memory[OldData];
}
void free(void *) {}
@@ -65,13 +59,20 @@ BumpAllocatorTy BumpAllocator;
///
///{
-void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
- // TODO: Check KernelEnvironment for an allocator choice as soon as we have
- // more than one.
+void *allocator::alloc(uint64_t Size) {
+#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+ return BumpAllocator.alloc(Size);
+#else
+ return ::malloc(Size);
+#endif
}
-void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
-
-void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
+void allocator::free(void *Ptr) {
+#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+ BumpAllocator.free(Ptr);
+#else
+ ::free(Ptr);
+#endif
+}
///}
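The replacement above is a textbook bump allocator over a fixed static pool: one atomic fetch-add hands out 16-byte-aligned chunks from a 1 MiB buffer, free is a no-op, and exhaustion traps. A host-compilable sketch of the same scheme, with std::atomic standing in for the device runtime's atomic::add and utils::roundUp written out by hand:

    #include <atomic>
    #include <cstdint>
    #include <cstdio>

    // Host-compilable sketch of the fallback bump allocator added above;
    // pool size and alignment match the patch (1 MiB, 16-byte alignment).
    constexpr uint64_t kMemorySize = 1024 * 1024;
    constexpr uint64_t kAlignment = 16;
    alignas(kAlignment) static uint8_t Memory[kMemorySize];
    static std::atomic<uint64_t> Offset{0};

    void *bumpAlloc(uint64_t Size) {
      Size = (Size + kAlignment - 1) & ~(kAlignment - 1); // round up
      uint64_t Old = Offset.fetch_add(Size, std::memory_order_seq_cst);
      if (Old + Size >= kMemorySize) // out of pool: the runtime traps here
        __builtin_trap();
      return &Memory[Old];
    }

    void bumpFree(void *) {} // bump allocators never reclaim

    int main() {
      void *P = bumpAlloc(24); // request is rounded up to 32 bytes
      std::printf("first allocation at %p\n", P);
      bumpFree(P);
    }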
diff --git a/openmp/device/src/Kernel.cpp b/openmp/device/src/Kernel.cpp
index 8c2828b..05af35d 100644
--- a/openmp/device/src/Kernel.cpp
+++ b/openmp/device/src/Kernel.cpp
@@ -41,7 +41,6 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
synchronize::init(IsSPMD);
mapping::init(IsSPMD);
state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
- allocator::init(IsSPMD, KernelEnvironment);
workshare::init(IsSPMD);
}
diff --git a/openmp/device/src/Misc.cpp b/openmp/device/src/Misc.cpp
index 563f674..a53fb43 100644
--- a/openmp/device/src/Misc.cpp
+++ b/openmp/device/src/Misc.cpp
@@ -100,7 +100,7 @@ void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
case omp_const_mem_alloc:
case omp_high_bw_mem_alloc:
case omp_low_lat_mem_alloc:
- return malloc(size);
+ return ompx::allocator::alloc(size);
default:
return nullptr;
}
@@ -113,7 +113,7 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
case omp_const_mem_alloc:
case omp_high_bw_mem_alloc:
case omp_low_lat_mem_alloc:
- free(ptr);
+ ompx::allocator::free(ptr);
return;
case omp_null_allocator:
default:
diff --git a/openmp/device/src/State.cpp b/openmp/device/src/State.cpp
index 4753951..9f38cf2 100644
--- a/openmp/device/src/State.cpp
+++ b/openmp/device/src/State.cpp
@@ -44,26 +44,6 @@ using namespace ompx;
namespace {
-/// Fallback implementations are missing to trigger a link time error.
-/// Implementations for new devices, including the host, should go into a
-/// dedicated begin/end declare variant.
-///
-///{
-extern "C" {
-#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
-
-[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
-[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
-
-#else
-
-[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
-[[gnu::weak, gnu::leaf]] void free(void *Ptr);
-
-#endif
-}
-///}
-
/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
@@ -171,13 +151,13 @@ void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
}
void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
- void *Ptr = malloc(Bytes);
+ void *Ptr = allocator::alloc(Bytes);
if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
printf("nullptr returned by malloc!\n");
return Ptr;
}
-void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
+void memory::freeGlobal(void *Ptr, const char *Reason) { allocator::free(Ptr); }
///}
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index cd78a5b..1b6f30a 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -1521,5 +1521,4 @@ debugging features are supported.
* Enable debugging assertions in the device. ``0x01``
* Enable diagnosing common problems during offloading. ``0x4``
- * Enable device malloc statistics (amdgpu only). ``0x8``
* Dump device PGO counters (only if PGO on GPU is enabled). ``0x10``
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 83414ce..943ae10 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3816,6 +3816,40 @@ cc_library(
],
)
+gentbl_cc_library(
+ name = "XeGPUTransformOpsIncGen",
+ tbl_outs = {
+ "include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.h.inc": ["-gen-op-decls"],
+ "include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp.inc": ["-gen-op-defs"],
+ },
+ tblgen = ":mlir-tblgen",
+ td_file = "include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td",
+ deps = [
+ ":TransformDialectTdFiles",
+ ],
+)
+
+cc_library(
+ name = "XeGPUTransformOps",
+ srcs = [
+ "lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp",
+ ],
+ hdrs = [
+ "include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.h",
+ ],
+ includes = ["include"],
+ deps = [
+ ":DialectUtils",
+ ":IR",
+ ":SCFDialect",
+ ":TransformDialect",
+ ":TransformDialectInterfaces",
+ ":XeGPUDialect",
+ ":XeGPUTransformOpsIncGen",
+ ":XeGPUUtils",
+ ],
+)
+
td_library(
name = "FuncTdFiles",
srcs = [
@@ -9470,6 +9504,7 @@ cc_library(
":UBToLLVM",
":VectorToLLVM",
":VectorTransformOps",
+ ":XeGPUTransformOps",
":XeVMToLLVM",
":XeVMToLLVMIRTranslation",
],
@@ -10212,12 +10247,18 @@ cc_library(
hdrs = glob(["include/mlir/Dialect/OpenACC/Transforms/*.h"]),
includes = ["include"],
deps = [
+ ":Analysis",
+ ":ArithDialect",
":FuncDialect",
+ ":FunctionInterfaces",
":IR",
+ ":MemRefDialect",
+ ":OpenACCAnalysis",
":OpenACCDialect",
":OpenACCPassIncGen",
":Pass",
":TransformUtils",
+ ":ViewLikeInterface",
"//llvm:Support",
],
)