aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bolt/lib/Core/BinaryContext.cpp2
-rw-r--r--bolt/lib/Rewrite/DWARFRewriter.cpp2
-rw-r--r--clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp2
-rw-r--r--clang-tools-extra/clang-move/Move.cpp2
-rw-r--r--clang-tools-extra/clangd/ConfigCompile.cpp2
-rw-r--r--clang-tools-extra/clangd/SystemIncludeExtractor.cpp2
-rw-r--r--clang-tools-extra/clangd/index/SymbolCollector.cpp2
-rw-r--r--clang-tools-extra/clangd/tool/ClangdMain.cpp2
-rw-r--r--clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp2
-rw-r--r--clang/CMakeLists.txt15
-rw-r--r--clang/cmake/caches/BOLT-CSSPGO.cmake3
-rw-r--r--clang/cmake/caches/BOLT-PGO.cmake3
-rw-r--r--clang/cmake/caches/CSSPGO.cmake2
-rw-r--r--clang/include/clang/AST/CharUnits.h6
-rw-r--r--clang/include/clang/AST/Decl.h3
-rw-r--r--clang/include/clang/AST/ExprCXX.h8
-rw-r--r--clang/include/clang/Basic/LangOptions.h3
-rw-r--r--clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h7
-rw-r--r--clang/include/clang/CIR/Dialect/IR/CIROps.td10
-rw-r--r--clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td6
-rw-r--r--clang/include/clang/Driver/Options.td6
-rw-r--r--clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def8
-rw-r--r--clang/include/clang/Parse/ParseHLSLRootSignature.h3
-rw-r--r--clang/lib/AST/APValue.cpp2
-rw-r--r--clang/lib/AST/Decl.cpp47
-rw-r--r--clang/lib/AST/RecordLayoutBuilder.cpp9
-rw-r--r--clang/lib/AST/TextNodeDumper.cpp3
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp64
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp6
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp8
-rw-r--r--clang/lib/CIR/Dialect/IR/CIRDialect.cpp11
-rw-r--r--clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp9
-rw-r--r--clang/lib/CodeGen/CGAtomic.cpp2
-rw-r--r--clang/lib/CodeGen/CGExprCXX.cpp56
-rw-r--r--clang/lib/CodeGen/CGExprConstant.cpp2
-rw-r--r--clang/lib/CodeGen/CGObjCMac.cpp2
-rw-r--r--clang/lib/CodeGen/CGOpenMPRuntime.cpp2
-rw-r--r--clang/lib/CodeGen/CGRecordLayoutBuilder.cpp12
-rw-r--r--clang/lib/Driver/ToolChains/HLSL.cpp2
-rw-r--r--clang/lib/Lex/HeaderSearch.cpp4
-rw-r--r--clang/lib/Parse/ParseHLSLRootSignature.cpp61
-rw-r--r--clang/lib/Sema/SemaChecking.cpp2
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp4
-rw-r--r--clang/lib/StaticAnalyzer/Core/Store.cpp2
-rw-r--r--clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp2
-rw-r--r--clang/test/AST/HLSL/RootSignature-Target-AST.hlsl12
-rw-r--r--clang/test/AST/HLSL/RootSignatures-AST.hlsl27
-rw-r--r--clang/test/Analysis/Checkers/WebKit/objc-mock-types.h36
-rw-r--r--clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm12
-rw-r--r--clang/test/CIR/CodeGen/complex.cpp14
-rw-r--r--clang/test/CodeGenHLSL/RootSignature.hlsl5
-rw-r--r--clang/test/OpenMP/amdgcn_save_temps.c2
-rw-r--r--clang/test/SemaHLSL/RootSignature-err.hlsl4
-rw-r--r--clang/test/SemaHLSL/RootSignature-flags-err.hlsl24
-rw-r--r--clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp8
-rw-r--r--clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp8
-rw-r--r--clang/unittests/Frontend/CompilerInstanceTest.cpp2
-rw-r--r--clang/unittests/Lex/LexHLSLRootSignatureTest.cpp3
-rw-r--r--clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp34
-rw-r--r--clang/utils/TableGen/RISCVVEmitter.cpp8
-rw-r--r--clang/utils/perf-training/CMakeLists.txt36
-rw-r--r--clang/utils/perf-training/perf-helper.py65
-rw-r--r--flang-rt/lib/runtime/derived-api.cpp20
-rw-r--r--flang/include/flang/Semantics/openmp-utils.h2
-rw-r--r--flang/lib/Semantics/check-omp-structure.cpp12
-rw-r--r--flang/lib/Semantics/openmp-utils.cpp18
-rw-r--r--flang/lib/Semantics/resolve-directives.cpp35
-rw-r--r--flang/test/Lower/OpenMP/wsloop-collapse-continue.f9019
-rw-r--r--flang/test/Semantics/OpenMP/declare-simd.f905
-rw-r--r--flang/test/Semantics/OpenMP/do08.f901
-rw-r--r--flang/test/Semantics/OpenMP/do13.f901
-rw-r--r--libc/src/__support/macros/attributes.h10
-rw-r--r--libc/src/string/CMakeLists.txt1
-rw-r--r--libc/src/string/memory_utils/aarch64/inline_strlen.h2
-rw-r--r--libc/src/string/memory_utils/generic/inline_strlen.h3
-rw-r--r--libc/src/string/memory_utils/x86_64/inline_strlen.h4
-rw-r--r--libc/src/string/string_utils.h3
-rw-r--r--libunwind/test/configs/cmake-bridge.cfg.in11
-rw-r--r--libunwind/test/eh_frame_fde_pc_range.pass.cpp7
-rw-r--r--lldb/include/lldb/Target/Statistics.h2
-rw-r--r--lldb/source/API/SBTarget.cpp1
-rw-r--r--lldb/source/Commands/CommandObjectTarget.cpp6
-rw-r--r--lldb/source/Target/Statistics.cpp5
-rw-r--r--lldb/test/API/functionalities/json/symbol-file/Makefile1
-rw-r--r--lldb/test/API/functionalities/stats_api/TestStatisticsAPI.py28
-rw-r--r--lldb/test/API/functionalities/stats_api/arm64-minidump-build-ids.yaml19
-rw-r--r--llvm/CMakeLists.txt3
-rw-r--r--llvm/cmake/modules/HandleLLVMOptions.cmake30
-rw-r--r--llvm/docs/AMDGPU/AMDGPUAsmGFX12.rst2002
-rw-r--r--llvm/docs/AMDGPU/gfx12_addr.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_attr.rst28
-rw-r--r--llvm/docs/AMDGPU/gfx12_clause.rst7
-rw-r--r--llvm/docs/AMDGPU/gfx12_data0_56f215.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_data0_6802ce.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_data0_e016a1.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_data0_fd235e.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_data1_6802ce.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_data1_731030.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_data1_e016a1.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_data1_fd235e.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_delay.rst74
-rw-r--r--llvm/docs/AMDGPU/gfx12_hwreg.rst76
-rw-r--r--llvm/docs/AMDGPU/gfx12_imm16.rst7
-rw-r--r--llvm/docs/AMDGPU/gfx12_ioffset.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_label.rst29
-rw-r--r--llvm/docs/AMDGPU/gfx12_literal_1f74c7.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_literal_81e671.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_m.rst13
-rw-r--r--llvm/docs/AMDGPU/gfx12_rsrc_5fe6d8.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_rsrc_c9f929.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_saddr_cdc95c.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_saddr_d42b64.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_samp.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_sbase_453b95.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sbase_47adb7.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdata_0974a4.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdata_354189.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdata_4585b8.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdata_5c7b50.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdata_6c003b.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdata_836716.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdata_d725ab.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdata_dd9dd8.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdst_006c40.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdst_20064d.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdst_354189.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdst_836716.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdst_ced58d.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sdst_e701cc.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_sendmsg.rst48
-rw-r--r--llvm/docs/AMDGPU/gfx12_sendmsg_rtn.rst30
-rw-r--r--llvm/docs/AMDGPU/gfx12_simm16_15ccdd.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_simm16_218bea.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_simm16_39b593.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_simm16_3d2a4f.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_simm16_730a13.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_simm16_7ed651.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_simm16_81e671.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_simm16_c98889.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_simm16_cc1716.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_simm16_ee8b30.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_soffset_8ec073.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_soffset_c5b88c.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_soffset_ec005a.rst20
-rw-r--r--llvm/docs/AMDGPU/gfx12_src0_5727cf.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src0_5cae62.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src0_6802ce.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src0_85aab6.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src0_c4593f.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src0_e016a1.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src0_fd235e.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src1_5727cf.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src1_5cae62.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src1_6802ce.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src1_731030.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src1_977794.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src1_c4593f.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src1_e016a1.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src1_fd235e.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src2_2797bc.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src2_5727cf.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src2_5cae62.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src2_6802ce.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src2_7b936a.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src2_96fbd3.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src2_c4593f.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_src2_e016a1.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_srcx0.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_srcy0.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_ssrc0_007f9c.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_ssrc0_1a9ca5.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_ssrc0_245536.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_ssrc0_2797bc.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_ssrc0_bbb4c6.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_ssrc0_c4593f.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_ssrc1_bbb4c6.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_ssrc1_c4593f.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_tgt.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vaddr_a972b9.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_vaddr_c12f43.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_vaddr_c8b8d4.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_vaddr_d82160.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_vaddr_f2b449.rst15
-rw-r--r--llvm/docs/AMDGPU/gfx12_vcc.rst16
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdata_2eda77.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdata_48e42f.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdata_69a144.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdata_89680f.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdata_aac3e8.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdata_bdb32f.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdst_006c40.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdst_227281.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdst_2eda77.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdst_47d3bc.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdst_48e42f.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdst_69a144.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdst_7de8e7.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdst_836716.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdst_89680f.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdst_bdb32f.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdstx.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vdsty.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_version.rst7
-rw-r--r--llvm/docs/AMDGPU/gfx12_vsrc0.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vsrc1_6802ce.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vsrc1_fd235e.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vsrc2.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vsrc3.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vsrc_56f215.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vsrc_6802ce.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vsrc_89fd7b.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vsrc_e016a1.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vsrc_fd235e.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vsrcx1.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_vsrcy1.rst17
-rw-r--r--llvm/docs/AMDGPU/gfx12_waitcnt.rst55
-rw-r--r--llvm/docs/AMDGPUModifierSyntax.rst109
-rw-r--r--llvm/docs/AMDGPUOperandSyntax.rst11
-rw-r--r--llvm/docs/AMDGPUUsage.rst2
-rw-r--r--llvm/docs/GettingInvolved.rst10
-rw-r--r--llvm/include/llvm/Analysis/IR2Vec.h263
-rw-r--r--llvm/include/llvm/BinaryFormat/DXContainer.h1
-rw-r--r--llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h7
-rw-r--r--llvm/include/llvm/Support/FileSystem.h12
-rw-r--r--llvm/include/llvm/Support/Path.h12
-rw-r--r--llvm/include/llvm/Target/TargetMachine.h4
-rw-r--r--llvm/lib/Analysis/CtxProfAnalysis.cpp4
-rw-r--r--llvm/lib/Analysis/IR2Vec.cpp274
-rw-r--r--llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp4
-rw-r--r--llvm/lib/Analysis/InlineAdvisor.cpp2
-rw-r--r--llvm/lib/Analysis/MemoryProfileInfo.cpp4
-rw-r--r--llvm/lib/Analysis/ModuleSummaryAnalysis.cpp2
-rw-r--r--llvm/lib/Analysis/ProfileSummaryInfo.cpp4
-rw-r--r--llvm/lib/CGData/CodeGenData.cpp3
-rw-r--r--llvm/lib/CGData/CodeGenDataReader.cpp4
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp6
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp2
-rw-r--r--llvm/lib/CodeGen/MachineRegionInfo.cpp3
-rw-r--r--llvm/lib/CodeGen/RegAllocScore.cpp4
-rw-r--r--llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp2
-rw-r--r--llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp9
-rw-r--r--llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp14
-rw-r--r--llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp2
-rw-r--r--llvm/lib/IR/Instruction.cpp4
-rw-r--r--llvm/lib/IR/Value.cpp2
-rw-r--r--llvm/lib/LTO/LTO.cpp3
-rw-r--r--llvm/lib/Object/OffloadBundle.cpp5
-rw-r--r--llvm/lib/ObjectYAML/DXContainerYAML.cpp2
-rw-r--r--llvm/lib/Passes/PassBuilderPipelines.cpp3
-rw-r--r--llvm/lib/ProfileData/MemProfCommon.cpp4
-rw-r--r--llvm/lib/Support/Path.cpp100
-rw-r--r--llvm/lib/Support/ScopedPrinter.cpp17
-rw-r--r--llvm/lib/Support/VirtualFileSystem.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp38
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td11
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp4
-rw-r--r--llvm/lib/Target/TargetMachine.cpp2
-rw-r--r--llvm/lib/Transforms/IPO/FunctionImport.cpp4
-rw-r--r--llvm/lib/Transforms/IPO/FunctionSpecialization.cpp11
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp3
-rw-r--r--llvm/lib/Transforms/IPO/SampleProfile.cpp3
-rw-r--r--llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp4
-rw-r--r--llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp4
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp29
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp10
-rw-r--r--llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp4
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp2
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp4
-rw-r--r--llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc2
-rw-r--r--llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/LICM.cpp8
-rw-r--r--llvm/lib/Transforms/Utils/FunctionImportUtils.cpp4
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp6
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp110
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp44
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp10
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp2
-rw-r--r--llvm/runtimes/CMakeLists.txt8
-rw-r--r--llvm/test/Analysis/IR2Vec/Inputs/dummy_2D_vocab.json28
-rw-r--r--llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_arg_vocab.json28
-rw-r--r--llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json29
-rw-r--r--llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt26
-rw-r--r--llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt26
-rw-r--r--llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt26
-rw-r--r--llvm/test/Analysis/IR2Vec/if-else.ll2
-rw-r--r--llvm/test/Analysis/IR2Vec/unreachable.ll2
-rw-r--r--llvm/test/CMakeLists.txt4
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll1311
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll1207
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll1207
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll326
-rw-r--r--llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llc-pipeline.ll60
-rw-r--r--llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll4
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressU.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressV.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressW.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-BorderColor.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ComparisonFunc.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Filter.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Flag.ll19
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxAnisotropy.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxLod.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLod.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLopBias.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-RegisterSpace.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderRegister.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderVisibility.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers_V3.ll42
-rw-r--r--llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/rootsignature-validation-fail-static-sampler-range.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll36
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt9
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt45
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt12
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll39
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll460
-rw-r--r--llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll118
-rw-r--r--llvm/test/Unit/CMakeLists.txt5
-rw-r--r--llvm/test/tools/llvm-ir2vec/entities.ll28
-rw-r--r--llvm/tools/llvm-cgdata/llvm-cgdata.cpp2
-rw-r--r--llvm/tools/llvm-config/llvm-config.cpp6
-rw-r--r--llvm/tools/llvm-dwp/llvm-dwp.cpp2
-rw-r--r--llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp6
-rw-r--r--llvm/tools/llvm-opt-report/OptReport.cpp2
-rw-r--r--llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp13
-rw-r--r--llvm/unittests/Analysis/IR2VecTest.cpp388
-rw-r--r--llvm/unittests/Analysis/MemoryProfileInfoTest.cpp2
-rw-r--r--llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp5
-rw-r--r--llvm/unittests/CodeGen/RegAllocScoreTest.cpp3
-rw-r--r--llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp7
-rw-r--r--llvm/unittests/ProfileData/MemProfTest.cpp11
-rw-r--r--llvm/unittests/Support/Path.cpp4
-rw-r--r--mlir/lib/IR/Builders.cpp7
-rw-r--r--utils/bazel/llvm-project-overlay/libc/BUILD.bazel1
336 files changed, 9033 insertions, 3078 deletions
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 98440cd..b7ded6b 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -1662,7 +1662,7 @@ void BinaryContext::preprocessDWODebugInfo() {
"files.\n";
}
// Prevent failures when DWOName is already an absolute path.
- sys::fs::make_absolute(DWOCompDir, AbsolutePath);
+ sys::path::make_absolute(DWOCompDir, AbsolutePath);
DWARFUnit *DWOCU =
DwarfUnit->getNonSkeletonUnitDIE(false, AbsolutePath).getDwarfUnit();
if (!DWOCU->isDWOUnit()) {
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 5c89a42..7366d2a 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -1853,7 +1853,7 @@ void DWARFRewriter::writeDWOFiles(
else if (!sys::fs::exists(CompDir))
CompDir = ".";
// Prevent failures when DWOName is already an absolute path.
- sys::fs::make_absolute(CompDir, AbsolutePath);
+ sys::path::make_absolute(CompDir, AbsolutePath);
std::error_code EC;
std::unique_ptr<ToolOutputFile> TempOut =
diff --git a/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp b/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp
index b895075..0ac8f71 100644
--- a/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp
+++ b/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp
@@ -142,7 +142,7 @@ groupReplacements(const TUReplacements &TUs, const TUDiagnostics &TUDs,
// build directories, make them absolute immediately.
SmallString<128> Path = R.getFilePath();
if (BuildDir)
- llvm::sys::fs::make_absolute(*BuildDir, Path);
+ llvm::sys::path::make_absolute(*BuildDir, Path);
else
SM.getFileManager().makeAbsolutePath(Path);
diff --git a/clang-tools-extra/clang-move/Move.cpp b/clang-tools-extra/clang-move/Move.cpp
index 17f5971..519d3599 100644
--- a/clang-tools-extra/clang-move/Move.cpp
+++ b/clang-tools-extra/clang-move/Move.cpp
@@ -75,7 +75,7 @@ std::string MakeAbsolutePath(StringRef CurrentDir, StringRef Path) {
return "";
llvm::SmallString<128> InitialDirectory(CurrentDir);
llvm::SmallString<128> AbsolutePath(Path);
- llvm::sys::fs::make_absolute(InitialDirectory, AbsolutePath);
+ llvm::sys::path::make_absolute(InitialDirectory, AbsolutePath);
return CleanPath(std::move(AbsolutePath));
}
diff --git a/clang-tools-extra/clangd/ConfigCompile.cpp b/clang-tools-extra/clangd/ConfigCompile.cpp
index 962a48b..18e3180 100644
--- a/clang-tools-extra/clangd/ConfigCompile.cpp
+++ b/clang-tools-extra/clangd/ConfigCompile.cpp
@@ -131,7 +131,7 @@ struct FragmentCompiler {
return std::nullopt;
}
llvm::SmallString<256> AbsPath = llvm::StringRef(*Path);
- llvm::sys::fs::make_absolute(FragmentDirectory, AbsPath);
+ llvm::sys::path::make_absolute(FragmentDirectory, AbsPath);
llvm::sys::path::native(AbsPath, Style);
return AbsPath.str().str();
}
diff --git a/clang-tools-extra/clangd/SystemIncludeExtractor.cpp b/clang-tools-extra/clangd/SystemIncludeExtractor.cpp
index 106de1b..4a5cd3b 100644
--- a/clang-tools-extra/clangd/SystemIncludeExtractor.cpp
+++ b/clang-tools-extra/clangd/SystemIncludeExtractor.cpp
@@ -106,7 +106,7 @@ struct DriverArgs {
// relative or absolute).
if (llvm::any_of(Driver,
[](char C) { return llvm::sys::path::is_separator(C); })) {
- llvm::sys::fs::make_absolute(Cmd.Directory, Driver);
+ llvm::sys::path::make_absolute(Cmd.Directory, Driver);
}
this->Driver = Driver.str().str();
for (size_t I = 0, E = Cmd.CommandLine.size(); I < E; ++I) {
diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp
index 6bdb108..39c479b 100644
--- a/clang-tools-extra/clangd/index/SymbolCollector.cpp
+++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp
@@ -325,7 +325,7 @@ private:
if (R.second) {
llvm::SmallString<256> AbsPath = Path;
if (!llvm::sys::path::is_absolute(AbsPath) && !FallbackDir.empty())
- llvm::sys::fs::make_absolute(FallbackDir, AbsPath);
+ llvm::sys::path::make_absolute(FallbackDir, AbsPath);
assert(llvm::sys::path::is_absolute(AbsPath) &&
"If the VFS can't make paths absolute, a FallbackDir must be "
"provided");
diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
index 4de2f21..4a990f8 100644
--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -578,7 +578,7 @@ public:
Body = Body.ltrim('/');
llvm::SmallString<16> Path(Body);
path::native(Path);
- fs::make_absolute(TestScheme::TestDir, Path);
+ path::make_absolute(TestScheme::TestDir, Path);
return std::string(Path);
}
diff --git a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp
index 372ab5f..fefbfc3 100644
--- a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp
+++ b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp
@@ -344,7 +344,7 @@ mapInputsToAbsPaths(clang::tooling::CompilationDatabase &CDB,
}
for (const auto &Cmd : Cmds) {
llvm::SmallString<256> CDBPath(Cmd.Filename);
- llvm::sys::fs::make_absolute(Cmd.Directory, CDBPath);
+ llvm::sys::path::make_absolute(Cmd.Directory, CDBPath);
CDBToAbsPaths[std::string(CDBPath)] = std::string(AbsPath);
}
}
diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index 4eaa7128..e4cb1a3 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -754,11 +754,22 @@ if (CLANG_ENABLE_BOOTSTRAP)
if(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED)
add_dependencies(clang-bootstrap-deps llvm-profdata)
set(PGO_OPT -DLLVM_PROFDATA=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-profdata)
+ string(TOUPPER "${BOOTSTRAP_LLVM_BUILD_INSTRUMENTED}" BOOTSTRAP_LLVM_BUILD_INSTRUMENTED)
+ if (BOOTSTRAP_LLVM_BUILD_INSTRUMENTED STREQUAL "CSSPGO")
+ add_dependencies(clang-bootstrap-deps llvm-profgen)
+ list(APPEND PGO_OPT -DLLVM_PROFGEN=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-profgen)
+ endif()
endif()
if(LLVM_BUILD_INSTRUMENTED)
- add_dependencies(clang-bootstrap-deps generate-profdata)
- set(PGO_OPT -DLLVM_PROFDATA_FILE=${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/clang.profdata)
+ string(TOUPPER "${LLVM_BUILD_INSTRUMENTED}" LLVM_BUILD_INSTRUMENTED)
+ if (LLVM_BUILD_INSTRUMENTED STREQUAL "CSSPGO")
+ add_dependencies(clang-bootstrap-deps generate-sprofdata)
+ set(PGO_OPT -DLLVM_SPROFDATA_FILE=${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/clang.sprofdata)
+ else()
+ add_dependencies(clang-bootstrap-deps generate-profdata)
+ set(PGO_OPT -DLLVM_PROFDATA_FILE=${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/clang.profdata)
+ endif()
# Use the current tools for LTO instead of the instrumented ones
list(APPEND _BOOTSTRAP_DEFAULT_PASSTHROUGH
CMAKE_CXX_COMPILER
diff --git a/clang/cmake/caches/BOLT-CSSPGO.cmake b/clang/cmake/caches/BOLT-CSSPGO.cmake
new file mode 100644
index 0000000..b1c204a
--- /dev/null
+++ b/clang/cmake/caches/BOLT-CSSPGO.cmake
@@ -0,0 +1,3 @@
+set(BOLT_PGO_CMAKE_CACHE "CSSPGO" CACHE STRING "")
+set(BOOTSTRAP_CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "")
+include(${CMAKE_CURRENT_LIST_DIR}/BOLT-PGO.cmake)
diff --git a/clang/cmake/caches/BOLT-PGO.cmake b/clang/cmake/caches/BOLT-PGO.cmake
index 1a04ca9..cc9410f 100644
--- a/clang/cmake/caches/BOLT-PGO.cmake
+++ b/clang/cmake/caches/BOLT-PGO.cmake
@@ -1,3 +1,4 @@
+set(BOLT_PGO_CMAKE_CACHE "PGO" CACHE STRING "")
set(LLVM_ENABLE_PROJECTS "bolt;clang;lld" CACHE STRING "")
set(CLANG_BOOTSTRAP_TARGETS
@@ -14,4 +15,4 @@ set(BOOTSTRAP_CLANG_BOOTSTRAP_TARGETS
set(PGO_BUILD_CONFIGURATION
${CMAKE_CURRENT_LIST_DIR}/BOLT.cmake
CACHE STRING "")
-include(${CMAKE_CURRENT_LIST_DIR}/PGO.cmake)
+include(${CMAKE_CURRENT_LIST_DIR}/${BOLT_PGO_CMAKE_CACHE}.cmake)
diff --git a/clang/cmake/caches/CSSPGO.cmake b/clang/cmake/caches/CSSPGO.cmake
new file mode 100644
index 0000000..59e08a6
--- /dev/null
+++ b/clang/cmake/caches/CSSPGO.cmake
@@ -0,0 +1,2 @@
+set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED "CSSPGO" CACHE STRING "")
+include(${CMAKE_CURRENT_LIST_DIR}/PGO.cmake)
diff --git a/clang/include/clang/AST/CharUnits.h b/clang/include/clang/AST/CharUnits.h
index c06354451..e570bfa 100644
--- a/clang/include/clang/AST/CharUnits.h
+++ b/clang/include/clang/AST/CharUnits.h
@@ -141,7 +141,7 @@ namespace clang {
/// Among other things, this promises that
/// self.alignTo(N) will just return self.
bool isMultipleOf(CharUnits N) const {
- return (*this % N) == 0;
+ return (*this % N) == CharUnits::Zero();
}
// Arithmetic operators.
@@ -165,8 +165,8 @@ namespace clang {
CharUnits operator% (QuantityType N) const {
return CharUnits(Quantity % N);
}
- QuantityType operator% (const CharUnits &Other) const {
- return Quantity % Other.Quantity;
+ CharUnits operator%(const CharUnits &Other) const {
+ return CharUnits(Quantity % Other.Quantity);
}
CharUnits operator+ (const CharUnits &Other) const {
return CharUnits(Quantity + Other.Quantity);
diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index d85d04d..406d79e 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -80,6 +80,7 @@ class TypeAliasTemplateDecl;
class UnresolvedSetImpl;
class VarTemplateDecl;
enum class ImplicitParamKind;
+struct UsualDeleteParams;
// Holds a constraint expression along with a pack expansion index, if
// expanded.
@@ -2646,6 +2647,8 @@ public:
bool isTypeAwareOperatorNewOrDelete() const;
void setIsTypeAwareOperatorNewOrDelete(bool IsTypeAwareOperator = true);
+ UsualDeleteParams getUsualDeleteParams() const;
+
/// Compute the language linkage.
LanguageLinkage getLanguageLinkage() const;
diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h
index 5f16bac..d78c7b6 100644
--- a/clang/include/clang/AST/ExprCXX.h
+++ b/clang/include/clang/AST/ExprCXX.h
@@ -2342,6 +2342,14 @@ struct ImplicitDeallocationParameters {
SizedDeallocationMode PassSize;
};
+/// The parameters to pass to a usual operator delete.
+struct UsualDeleteParams {
+ TypeAwareAllocationMode TypeAwareDelete = TypeAwareAllocationMode::No;
+ bool DestroyingDelete = false;
+ bool Size = false;
+ AlignedAllocationMode Alignment = AlignedAllocationMode::No;
+};
+
/// Represents a new-expression for memory allocation and constructor
/// calls, e.g: "new CXXNewExpr(foo)".
class CXXNewExpr final
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index a8943df..41595ec 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -549,8 +549,7 @@ public:
bool CheckNew = false;
/// The HLSL root signature version for dxil.
- llvm::dxbc::RootSignatureVersion HLSLRootSigVer =
- llvm::dxbc::RootSignatureVersion::V1_1;
+ llvm::dxbc::RootSignatureVersion HLSLRootSigVer;
/// The HLSL root signature that will be used to overide the root signature
/// used for the shader entry point.
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 3f83c30..8a5bf03 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -148,9 +148,10 @@ public:
}
mlir::Value createComplexReal(mlir::Location loc, mlir::Value operand) {
- auto operandTy = mlir::cast<cir::ComplexType>(operand.getType());
- return cir::ComplexRealOp::create(*this, loc, operandTy.getElementType(),
- operand);
+ auto resultType = operand.getType();
+ if (auto complexResultType = mlir::dyn_cast<cir::ComplexType>(resultType))
+ resultType = complexResultType.getElementType();
+ return cir::ComplexRealOp::create(*this, loc, resultType, operand);
}
mlir::Value createComplexImag(mlir::Location loc, mlir::Value operand) {
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index f857cf8..0a78492 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -3260,18 +3260,20 @@ def CIR_ComplexCreateOp : CIR_Op<"complex.create", [Pure, SameTypeOperands]> {
def CIR_ComplexRealOp : CIR_Op<"complex.real", [Pure]> {
let summary = "Extract the real part of a complex value";
let description = [{
- `cir.complex.real` operation takes an operand of `!cir.complex` type and
- yields the real part of it.
+ `cir.complex.real` operation takes an operand of `!cir.complex`, `!cir.int`
+ or `!cir.float`. If the operand is `!cir.complex`, the real part of it will
+ be returned, otherwise the value returned unmodified.
Example:
```mlir
- %1 = cir.complex.real %0 : !cir.complex<!cir.float> -> !cir.float
+ %real = cir.complex.real %complex : !cir.complex<!cir.float> -> !cir.float
+ %real = cir.complex.real %scalar : !cir.float -> !cir.float
```
}];
let results = (outs CIR_AnyIntOrFloatType:$result);
- let arguments = (ins CIR_ComplexType:$operand);
+ let arguments = (ins CIR_AnyComplexOrIntOrFloatType:$operand);
let assemblyFormat = [{
$operand `:` qualified(type($operand)) `->` qualified(type($result))
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td
index 82f6e1d..da03a29 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td
@@ -165,6 +165,12 @@ def CIR_AnyIntOrFloatType : AnyTypeOf<[CIR_AnyFloatType, CIR_AnyIntType],
def CIR_AnyComplexType : CIR_TypeBase<"::cir::ComplexType", "complex type">;
+def CIR_AnyComplexOrIntOrFloatType : AnyTypeOf<[
+ CIR_AnyComplexType, CIR_AnyFloatType, CIR_AnyIntType
+], "complex, integer or floating point type"> {
+ let cppFunctionName = "isComplexOrIntegerOrFloatingPointType";
+}
+
//===----------------------------------------------------------------------===//
// Array Type predicates
//===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 6245cf33..096df56 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -9473,7 +9473,7 @@ def target_profile : DXCJoinedOrSeparate<"T">, MetaVarName<"<profile>">,
"lib_6_3, lib_6_4, lib_6_5, lib_6_6, lib_6_7, lib_6_x,"
"ms_6_5, ms_6_6, ms_6_7,"
"as_6_5, as_6_6, as_6_7,"
- "rootsig_1_0, rootsig_1_1">;
+ "rootsig_1_0, rootsig_1_1, rootsig_1_2">;
def emit_pristine_llvm : DXCFlag<"emit-pristine-llvm">,
HelpText<"Emit pristine LLVM IR from the frontend by not running any LLVM passes at all."
"Same as -S + -emit-llvm + -disable-llvm-passes.">;
@@ -9486,9 +9486,9 @@ def fdx_rootsignature_version :
Group<dxc_Group>,
Visibility<[ClangOption, CC1Option]>,
HelpText<"Root Signature Version">,
- Values<"rootsig_1_0,rootsig_1_1">,
+ Values<"rootsig_1_0,rootsig_1_1,rootsig_1_2">,
NormalizedValuesScope<"llvm::dxbc::RootSignatureVersion">,
- NormalizedValues<["V1_0", "V1_1"]>,
+ NormalizedValues<["V1_0", "V1_1", "V1_2"]>,
MarshallingInfoEnum<LangOpts<"HLSLRootSigVer">, "V1_1">;
def dxc_rootsig_ver :
Separate<["/", "-"], "force-rootsig-ver">,
diff --git a/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def b/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def
index a5cfeb3..1d7f7ad 100644
--- a/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def
+++ b/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def
@@ -65,6 +65,9 @@
#ifndef STATIC_BORDER_COLOR_ENUM
#define STATIC_BORDER_COLOR_ENUM(NAME, LIT) ENUM(NAME, LIT)
#endif
+#ifndef STATIC_SAMPLER_FLAG_ENUM
+#define STATIC_SAMPLER_FLAG_ENUM(NAME, LIT) ENUM(NAME, LIT)
+#endif
// General Tokens:
TOK(invalid, "invalid identifier")
@@ -228,6 +231,10 @@ STATIC_BORDER_COLOR_ENUM(OpaqueWhite, "STATIC_BORDER_COLOR_OPAQUE_WHITE")
STATIC_BORDER_COLOR_ENUM(OpaqueBlackUint, "STATIC_BORDER_COLOR_OPAQUE_BLACK_UINT")
STATIC_BORDER_COLOR_ENUM(OpaqueWhiteUint, "STATIC_BORDER_COLOR_OPAQUE_WHITE_UINT")
+// Static Sampler Flag Enums:
+STATIC_SAMPLER_FLAG_ENUM(UintBorderColor, "UINT_BORDER_COLOR")
+STATIC_SAMPLER_FLAG_ENUM(NonNormalizedCoordinates, "NON_NORMALIZED_COORDINATES")
+
#undef STATIC_BORDER_COLOR_ENUM
#undef COMPARISON_FUNC_ENUM
#undef TEXTURE_ADDRESS_MODE_ENUM
@@ -237,6 +244,7 @@ STATIC_BORDER_COLOR_ENUM(OpaqueWhiteUint, "STATIC_BORDER_COLOR_OPAQUE_WHITE_UINT
#undef DESCRIPTOR_RANGE_FLAG_ENUM_OFF
#undef DESCRIPTOR_RANGE_FLAG_ENUM_ON
#undef ROOT_DESCRIPTOR_FLAG_ENUM
+#undef STATIC_SAMPLER_FLAG_ENUM
#undef ROOT_FLAG_ENUM
#undef DESCRIPTOR_RANGE_OFFSET_ENUM
#undef UNBOUNDED_ENUM
diff --git a/clang/include/clang/Parse/ParseHLSLRootSignature.h b/clang/include/clang/Parse/ParseHLSLRootSignature.h
index b06846f..8f91d7c 100644
--- a/clang/include/clang/Parse/ParseHLSLRootSignature.h
+++ b/clang/include/clang/Parse/ParseHLSLRootSignature.h
@@ -130,6 +130,7 @@ private:
std::optional<float> MaxLOD;
std::optional<uint32_t> Space;
std::optional<llvm::dxbc::ShaderVisibility> Visibility;
+ std::optional<llvm::dxbc::StaticSamplerFlags> Flags;
};
std::optional<ParsedStaticSamplerParams> parseStaticSamplerParams();
@@ -153,6 +154,8 @@ private:
parseRootDescriptorFlags(RootSignatureToken::Kind Context);
std::optional<llvm::dxbc::DescriptorRangeFlags>
parseDescriptorRangeFlags(RootSignatureToken::Kind Context);
+ std::optional<llvm::dxbc::StaticSamplerFlags>
+ parseStaticSamplerFlags(RootSignatureToken::Kind Context);
/// Use NumericLiteralParser to convert CurToken.NumSpelling into a unsigned
/// 32-bit integer
diff --git a/clang/lib/AST/APValue.cpp b/clang/lib/AST/APValue.cpp
index 7173c2a..2e1c8eb 100644
--- a/clang/lib/AST/APValue.cpp
+++ b/clang/lib/AST/APValue.cpp
@@ -784,7 +784,7 @@ void APValue::printPretty(raw_ostream &Out, const PrintingPolicy &Policy,
if (!O.isZero()) {
if (IsReference)
Out << "*(";
- if (S.isZero() || O % S) {
+ if (S.isZero() || !O.isMultipleOf(S)) {
Out << "(char*)";
S = CharUnits::One();
}
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index cd8e495..c734155 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -3552,6 +3552,53 @@ void FunctionDecl::setIsTypeAwareOperatorNewOrDelete(bool IsTypeAware) {
getASTContext().setIsTypeAwareOperatorNewOrDelete(this, IsTypeAware);
}
+UsualDeleteParams FunctionDecl::getUsualDeleteParams() const {
+ UsualDeleteParams Params;
+
+ // This function should only be called for operator delete declarations.
+ assert(getDeclName().isAnyOperatorDelete());
+ if (!getDeclName().isAnyOperatorDelete())
+ return Params;
+
+ const FunctionProtoType *FPT = getType()->castAs<FunctionProtoType>();
+ auto AI = FPT->param_type_begin(), AE = FPT->param_type_end();
+
+ if (isTypeAwareOperatorNewOrDelete()) {
+ Params.TypeAwareDelete = TypeAwareAllocationMode::Yes;
+ assert(AI != AE);
+ ++AI;
+ }
+
+ // The first argument after the type-identity parameter (if any) is
+ // always a void* (or C* for a destroying operator delete for class
+ // type C).
+ ++AI;
+
+ // The next parameter may be a std::destroying_delete_t.
+ if (isDestroyingOperatorDelete()) {
+ assert(!isTypeAwareAllocation(Params.TypeAwareDelete));
+ Params.DestroyingDelete = true;
+ assert(AI != AE);
+ ++AI;
+ }
+
+ // Figure out what other parameters we should be implicitly passing.
+ if (AI != AE && (*AI)->isIntegerType()) {
+ Params.Size = true;
+ ++AI;
+ } else
+ assert(!isTypeAwareAllocation(Params.TypeAwareDelete));
+
+ if (AI != AE && (*AI)->isAlignValT()) {
+ Params.Alignment = AlignedAllocationMode::Yes;
+ ++AI;
+ } else
+ assert(!isTypeAwareAllocation(Params.TypeAwareDelete));
+
+ assert(AI == AE && "unexpected usual deallocation function parameter");
+ return Params;
+}
+
LanguageLinkage FunctionDecl::getLanguageLinkage() const {
return getDeclLanguageLinkage(*this);
}
diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp
index 43f4e07..00b938b 100644
--- a/clang/lib/AST/RecordLayoutBuilder.cpp
+++ b/clang/lib/AST/RecordLayoutBuilder.cpp
@@ -2087,9 +2087,8 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D,
if (InsertExtraPadding) {
CharUnits ASanAlignment = CharUnits::fromQuantity(8);
CharUnits ExtraSizeForAsan = ASanAlignment;
- if (FieldSize % ASanAlignment)
- ExtraSizeForAsan +=
- ASanAlignment - CharUnits::fromQuantity(FieldSize % ASanAlignment);
+ if (!FieldSize.isMultipleOf(ASanAlignment))
+ ExtraSizeForAsan += ASanAlignment - (FieldSize % ASanAlignment);
EffectiveFieldSize = FieldSize = FieldSize + ExtraSizeForAsan;
}
@@ -2119,10 +2118,10 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D,
if (RD->hasAttr<PackedAttr>() || !MaxFieldAlignment.isZero())
if (FieldAlign < OriginalFieldAlign)
if (D->getType()->isRecordType()) {
- // If the offset is a multiple of the alignment of
+ // If the offset is not a multiple of the alignment of
// the type, raise the warning.
// TODO: Takes no account the alignment of the outer struct
- if (FieldOffset % OriginalFieldAlign != 0)
+ if (!FieldOffset.isMultipleOf(OriginalFieldAlign))
Diag(D->getLocation(), diag::warn_unaligned_access)
<< Context.getCanonicalTagType(RD) << D->getName()
<< D->getType();
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 8f7fe3b..cf5e914 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -3095,6 +3095,9 @@ void TextNodeDumper::VisitHLSLRootSignatureDecl(
case llvm::dxbc::RootSignatureVersion::V1_1:
OS << "1.1";
break;
+ case llvm::dxbc::RootSignatureVersion::V1_2:
+ OS << "1.2";
+ break;
}
OS << ", ";
llvm::hlsl::rootsig::dumpRootElements(OS, D->getRootElements());
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
index 83208bf..7989ad2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
@@ -210,60 +210,6 @@ RValue CIRGenFunction::emitCXXMemberOrOperatorCall(
return emitCall(fnInfo, callee, returnValue, args, nullptr, loc);
}
-namespace {
-/// The parameters to pass to a usual operator delete.
-struct UsualDeleteParams {
- TypeAwareAllocationMode typeAwareDelete = TypeAwareAllocationMode::No;
- bool destroyingDelete = false;
- bool size = false;
- AlignedAllocationMode alignment = AlignedAllocationMode::No;
-};
-} // namespace
-
-// FIXME(cir): this should be shared with LLVM codegen
-static UsualDeleteParams getUsualDeleteParams(const FunctionDecl *fd) {
- UsualDeleteParams params;
-
- const FunctionProtoType *fpt = fd->getType()->castAs<FunctionProtoType>();
- auto ai = fpt->param_type_begin(), ae = fpt->param_type_end();
-
- if (fd->isTypeAwareOperatorNewOrDelete()) {
- params.typeAwareDelete = TypeAwareAllocationMode::Yes;
- assert(ai != ae);
- ++ai;
- }
-
- // The first argument after the type-identity parameter (if any) is
- // always a void* (or C* for a destroying operator delete for class
- // type C).
- ++ai;
-
- // The next parameter may be a std::destroying_delete_t.
- if (fd->isDestroyingOperatorDelete()) {
- params.destroyingDelete = true;
- assert(ai != ae);
- ++ai;
- }
-
- // Figure out what other parameters we should be implicitly passing.
- if (ai != ae && (*ai)->isIntegerType()) {
- params.size = true;
- ++ai;
- } else {
- assert(!isTypeAwareAllocation(params.typeAwareDelete));
- }
-
- if (ai != ae && (*ai)->isAlignValT()) {
- params.alignment = AlignedAllocationMode::Yes;
- ++ai;
- } else {
- assert(!isTypeAwareAllocation(params.typeAwareDelete));
- }
-
- assert(ai == ae && "unexpected usual deallocation function parameter");
- return params;
-}
-
static mlir::Value emitCXXNewAllocSize(CIRGenFunction &cgf, const CXXNewExpr *e,
unsigned minElements,
mlir::Value &numElements,
@@ -616,11 +562,11 @@ void CIRGenFunction::emitDeleteCall(const FunctionDecl *deleteFD,
const auto *deleteFTy = deleteFD->getType()->castAs<FunctionProtoType>();
CallArgList deleteArgs;
- UsualDeleteParams params = getUsualDeleteParams(deleteFD);
+ UsualDeleteParams params = deleteFD->getUsualDeleteParams();
auto paramTypeIt = deleteFTy->param_type_begin();
// Pass std::type_identity tag if present
- if (isTypeAwareAllocation(params.typeAwareDelete))
+ if (isTypeAwareAllocation(params.TypeAwareDelete))
cgm.errorNYI(deleteFD->getSourceRange(),
"emitDeleteCall: type aware delete");
@@ -631,12 +577,12 @@ void CIRGenFunction::emitDeleteCall(const FunctionDecl *deleteFD,
deleteArgs.add(RValue::get(deletePtr), argTy);
// Pass the std::destroying_delete tag if present.
- if (params.destroyingDelete)
+ if (params.DestroyingDelete)
cgm.errorNYI(deleteFD->getSourceRange(),
"emitDeleteCall: destroying delete");
// Pass the size if the delete function has a size_t parameter.
- if (params.size) {
+ if (params.Size) {
QualType sizeType = *paramTypeIt++;
CharUnits deleteTypeSize = getContext().getTypeSizeInChars(deleteTy);
assert(mlir::isa<cir::IntType>(convertType(sizeType)) &&
@@ -648,7 +594,7 @@ void CIRGenFunction::emitDeleteCall(const FunctionDecl *deleteFD,
}
// Pass the alignment if the delete function has an align_val_t parameter.
- if (isAlignedAllocation(params.alignment))
+ if (isAlignedAllocation(params.Alignment))
cgm.errorNYI(deleteFD->getSourceRange(),
"emitDeleteCall: aligned allocation");
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index f4bbced..500007f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -2151,8 +2151,10 @@ mlir::Value ScalarExprEmitter::VisitRealImag(const UnaryOperator *e,
}
if (e->getOpcode() == UO_Real) {
- return promotionTy.isNull() ? Visit(op)
- : cgf.emitPromotedScalarExpr(op, promotionTy);
+ mlir::Value operand = promotionTy.isNull()
+ ? Visit(op)
+ : cgf.emitPromotedScalarExpr(op, promotionTy);
+ return builder.createComplexReal(loc, operand);
}
// __imag on a scalar returns zero. Emit the subexpr to ensure side
diff --git a/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp
index a762881..2baeb43 100644
--- a/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp
@@ -615,7 +615,7 @@ void CIRRecordLowering::determinePacked(bool nvBaseType) {
continue;
// If any member falls at an offset that it not a multiple of its alignment,
// then the entire record must be packed.
- if (member.offset % getAlignment(member.data))
+ if (!member.offset.isMultipleOf(getAlignment(member.data)))
packed = true;
if (member.offset < nvSize)
nvAlignment = std::max(nvAlignment, getAlignment(member.data));
@@ -623,12 +623,12 @@ void CIRRecordLowering::determinePacked(bool nvBaseType) {
}
// If the size of the record (the capstone's offset) is not a multiple of the
// record's alignment, it must be packed.
- if (members.back().offset % alignment)
+ if (!members.back().offset.isMultipleOf(alignment))
packed = true;
// If the non-virtual sub-object is not a multiple of the non-virtual
// sub-object's alignment, it must be packed. We cannot have a packed
// non-virtual sub-object and an unpacked complete object or vise versa.
- if (nvSize % nvAlignment)
+ if (!nvSize.isMultipleOf(nvAlignment))
packed = true;
// Update the alignment of the sentinel.
if (!packed)
@@ -824,7 +824,7 @@ void CIRRecordLowering::lowerUnion() {
appendPaddingBytes(layoutSize - getSize(storageType));
// Set packed if we need it.
- if (layoutSize % getAlignment(storageType))
+ if (!layoutSize.isMultipleOf(getAlignment(storageType)))
packed = true;
}
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index fb87036..6b5cc80 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -2388,14 +2388,23 @@ OpFoldResult cir::ComplexCreateOp::fold(FoldAdaptor adaptor) {
//===----------------------------------------------------------------------===//
LogicalResult cir::ComplexRealOp::verify() {
- if (getType() != getOperand().getType().getElementType()) {
+ mlir::Type operandTy = getOperand().getType();
+ if (auto complexOperandTy = mlir::dyn_cast<cir::ComplexType>(operandTy)) {
+ operandTy = complexOperandTy.getElementType();
+ }
+
+ if (getType() != operandTy) {
emitOpError() << ": result type does not match operand type";
return failure();
}
+
return success();
}
OpFoldResult cir::ComplexRealOp::fold(FoldAdaptor adaptor) {
+ if (!mlir::isa<cir::ComplexType>(getOperand().getType()))
+ return nullptr;
+
if (auto complexCreateOp = getOperand().getDefiningOp<cir::ComplexCreateOp>())
return complexCreateOp.getOperand(0);
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 22f069d..4bc7783 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -2999,8 +2999,13 @@ mlir::LogicalResult CIRToLLVMComplexRealOpLowering::matchAndRewrite(
cir::ComplexRealOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const {
mlir::Type resultLLVMTy = getTypeConverter()->convertType(op.getType());
- rewriter.replaceOpWithNewOp<mlir::LLVM::ExtractValueOp>(
- op, resultLLVMTy, adaptor.getOperand(), llvm::ArrayRef<std::int64_t>{0});
+ mlir::Value operand = adaptor.getOperand();
+ if (mlir::isa<cir::ComplexType>(op.getOperand().getType())) {
+ operand = mlir::LLVM::ExtractValueOp::create(
+ rewriter, op.getLoc(), resultLLVMTy, operand,
+ llvm::ArrayRef<std::int64_t>{0});
+ }
+ rewriter.replaceOp(op, operand);
return mlir::success();
}
diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp
index eeb0fd6..4a3446a 100644
--- a/clang/lib/CodeGen/CGAtomic.cpp
+++ b/clang/lib/CodeGen/CGAtomic.cpp
@@ -880,7 +880,7 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
CharUnits MaxInlineWidth =
getContext().toCharUnitsFromBits(MaxInlineWidthInBits);
DiagnosticsEngine &Diags = CGM.getDiags();
- bool Misaligned = (Ptr.getAlignment() % TInfo.Width) != 0;
+ bool Misaligned = !Ptr.getAlignment().isMultipleOf(TInfo.Width);
bool Oversized = getContext().toBits(TInfo.Width) > MaxInlineWidthInBits;
if (Misaligned) {
Diags.Report(E->getBeginLoc(), diag::warn_atomic_op_misaligned)
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index a092b71..c52526c 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -1377,58 +1377,6 @@ RValue CodeGenFunction::EmitBuiltinNewDeleteCall(const FunctionProtoType *Type,
}
namespace {
-/// The parameters to pass to a usual operator delete.
-struct UsualDeleteParams {
- TypeAwareAllocationMode TypeAwareDelete = TypeAwareAllocationMode::No;
- bool DestroyingDelete = false;
- bool Size = false;
- AlignedAllocationMode Alignment = AlignedAllocationMode::No;
-};
-}
-
-static UsualDeleteParams getUsualDeleteParams(const FunctionDecl *FD) {
- UsualDeleteParams Params;
-
- const FunctionProtoType *FPT = FD->getType()->castAs<FunctionProtoType>();
- auto AI = FPT->param_type_begin(), AE = FPT->param_type_end();
-
- if (FD->isTypeAwareOperatorNewOrDelete()) {
- Params.TypeAwareDelete = TypeAwareAllocationMode::Yes;
- assert(AI != AE);
- ++AI;
- }
-
- // The first argument after the type-identity parameter (if any) is
- // always a void* (or C* for a destroying operator delete for class
- // type C).
- ++AI;
-
- // The next parameter may be a std::destroying_delete_t.
- if (FD->isDestroyingOperatorDelete()) {
- assert(!isTypeAwareAllocation(Params.TypeAwareDelete));
- Params.DestroyingDelete = true;
- assert(AI != AE);
- ++AI;
- }
-
- // Figure out what other parameters we should be implicitly passing.
- if (AI != AE && (*AI)->isIntegerType()) {
- Params.Size = true;
- ++AI;
- } else
- assert(!isTypeAwareAllocation(Params.TypeAwareDelete));
-
- if (AI != AE && (*AI)->isAlignValT()) {
- Params.Alignment = AlignedAllocationMode::Yes;
- ++AI;
- } else
- assert(!isTypeAwareAllocation(Params.TypeAwareDelete));
-
- assert(AI == AE && "unexpected usual deallocation function parameter");
- return Params;
-}
-
-namespace {
/// A cleanup to call the given 'operator delete' function upon abnormal
/// exit from a new expression. Templated on a traits type that deals with
/// ensuring that the arguments dominate the cleanup if necessary.
@@ -1505,7 +1453,7 @@ namespace {
} else {
// For a non-placement new-expression, 'operator delete' can take a
// size and/or an alignment if it has the right parameters.
- Params = getUsualDeleteParams(OperatorDelete);
+ Params = OperatorDelete->getUsualDeleteParams();
}
assert(!Params.DestroyingDelete &&
@@ -1838,7 +1786,7 @@ void CodeGenFunction::EmitDeleteCall(const FunctionDecl *DeleteFD,
const auto *DeleteFTy = DeleteFD->getType()->castAs<FunctionProtoType>();
CallArgList DeleteArgs;
- auto Params = getUsualDeleteParams(DeleteFD);
+ auto Params = DeleteFD->getUsualDeleteParams();
auto ParamTypeIt = DeleteFTy->param_type_begin();
std::optional<llvm::AllocaInst *> TagAlloca;
diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp
index b44dd9e..6407afc 100644
--- a/clang/lib/CodeGen/CGExprConstant.cpp
+++ b/clang/lib/CodeGen/CGExprConstant.cpp
@@ -433,7 +433,7 @@ llvm::Constant *ConstantAggregateBuilder::buildFrom(
// All remaining elements must be the same type.
if (Elems[I]->getType() != CommonType ||
- Offset(I) % ElemSize != 0) {
+ !Offset(I).isMultipleOf(ElemSize)) {
CanEmitArray = false;
break;
}
diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp
index 60f30a1..dbcce9b 100644
--- a/clang/lib/CodeGen/CGObjCMac.cpp
+++ b/clang/lib/CodeGen/CGObjCMac.cpp
@@ -5367,7 +5367,7 @@ IvarLayoutBuilder::buildBitmap(CGObjCCommonMac &CGObjC,
// Ignore scan requests that don't start at an even multiple of the
// word size. We can't encode them.
- if ((beginOfScan % WordSize) != 0)
+ if (!beginOfScan.isMultipleOf(WordSize))
continue;
// Ignore scan requests that start before the instance start.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 75bde3f..8cda583 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1542,7 +1542,7 @@ static llvm::TargetRegionEntryInfo getEntryInfoFromPresumedLoc(
SourceManager &SM = CGM.getContext().getSourceManager();
PresumedLoc PLoc = SM.getPresumedLoc(BeginLoc);
- if (CGM.getFileSystem()->exists(PLoc.getFilename()))
+ if (!CGM.getFileSystem()->exists(PLoc.getFilename()))
PLoc = SM.getPresumedLoc(BeginLoc, /*UseLineDirectives=*/false);
return std::pair<std::string, uint64_t>(PLoc.getFilename(), PLoc.getLine());
diff --git a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp
index 5f6136c..e9205c6 100644
--- a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp
+++ b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp
@@ -369,11 +369,11 @@ void CGRecordLowering::lowerUnion(bool isNonVirtualBaseType) {
appendPaddingBytes(LayoutSize - getSize(StorageType));
// Set packed if we need it.
const auto StorageAlignment = getAlignment(StorageType);
- assert((Layout.getSize() % StorageAlignment == 0 ||
- Layout.getDataSize() % StorageAlignment) &&
+ assert((Layout.getSize().isMultipleOf(StorageAlignment) ||
+ !Layout.getDataSize().isMultipleOf(StorageAlignment)) &&
"Union's standard layout and no_unique_address layout must agree on "
"packedness");
- if (Layout.getDataSize() % StorageAlignment)
+ if (!Layout.getDataSize().isMultipleOf(StorageAlignment))
Packed = true;
}
@@ -977,7 +977,7 @@ void CGRecordLowering::determinePacked(bool NVBaseType) {
continue;
// If any member falls at an offset that it not a multiple of its alignment,
// then the entire record must be packed.
- if (Member.Offset % getAlignment(Member.Data))
+ if (!Member.Offset.isMultipleOf(getAlignment(Member.Data)))
Packed = true;
if (Member.Offset < NVSize)
NVAlignment = std::max(NVAlignment, getAlignment(Member.Data));
@@ -985,12 +985,12 @@ void CGRecordLowering::determinePacked(bool NVBaseType) {
}
// If the size of the record (the capstone's offset) is not a multiple of the
// record's alignment, it must be packed.
- if (Members.back().Offset % Alignment)
+ if (!Members.back().Offset.isMultipleOf(Alignment))
Packed = true;
// If the non-virtual sub-object is not a multiple of the non-virtual
// sub-object's alignment, it must be packed. We cannot have a packed
// non-virtual sub-object and an unpacked complete object or vise versa.
- if (NVSize % NVAlignment)
+ if (!NVSize.isMultipleOf(NVAlignment))
Packed = true;
// Update the alignment of the sentinel.
if (!Packed)
diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp
index f4858e4..2869549 100644
--- a/clang/lib/Driver/ToolChains/HLSL.cpp
+++ b/clang/lib/Driver/ToolChains/HLSL.cpp
@@ -64,7 +64,7 @@ bool isLegalShaderModel(Triple &T) {
} break;
case Triple::EnvironmentType::RootSignature:
VersionTuple MinVer(1, 0);
- VersionTuple MaxVer(1, 1);
+ VersionTuple MaxVer(1, 2);
return MinVer <= Version && Version <= MaxVer;
}
return false;
diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp
index ae09f70..238c5e2 100644
--- a/clang/lib/Lex/HeaderSearch.cpp
+++ b/clang/lib/Lex/HeaderSearch.cpp
@@ -2077,7 +2077,7 @@ std::string HeaderSearch::suggestPathToFileForDiagnostics(
llvm::SmallString<32> FilePath = File;
if (!WorkingDir.empty() && !path::is_absolute(FilePath))
- fs::make_absolute(WorkingDir, FilePath);
+ path::make_absolute(WorkingDir, FilePath);
// remove_dots switches to backslashes on windows as a side-effect!
// We always want to suggest forward slashes for includes.
// (not remove_dots(..., posix) as that misparses windows paths).
@@ -2091,7 +2091,7 @@ std::string HeaderSearch::suggestPathToFileForDiagnostics(
// `BestPrefixLength` accordingly.
auto CheckDir = [&](llvm::SmallString<32> Dir) -> bool {
if (!WorkingDir.empty() && !path::is_absolute(Dir))
- fs::make_absolute(WorkingDir, Dir);
+ path::make_absolute(WorkingDir, Dir);
path::remove_dots(Dir, /*remove_dot_dot=*/true);
for (auto NI = path::begin(File), NE = path::end(File),
DI = path::begin(Dir), DE = path::end(Dir);
diff --git a/clang/lib/Parse/ParseHLSLRootSignature.cpp b/clang/lib/Parse/ParseHLSLRootSignature.cpp
index 3b16efb..7be6eec 100644
--- a/clang/lib/Parse/ParseHLSLRootSignature.cpp
+++ b/clang/lib/Parse/ParseHLSLRootSignature.cpp
@@ -485,6 +485,9 @@ std::optional<StaticSampler> RootSignatureParser::parseStaticSampler() {
if (Params->Visibility.has_value())
Sampler.Visibility = Params->Visibility.value();
+ if (Params->Flags.has_value())
+ Sampler.Flags = Params->Flags.value();
+
return Sampler;
}
@@ -926,6 +929,20 @@ RootSignatureParser::parseStaticSamplerParams() {
if (!Visibility.has_value())
return std::nullopt;
Params.Visibility = Visibility;
+ } else if (tryConsumeExpectedToken(TokenKind::kw_flags)) {
+ // `flags` `=` STATIC_SAMPLER_FLAGS
+ if (Params.Flags.has_value()) {
+ reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind;
+ return std::nullopt;
+ }
+
+ if (consumeExpectedToken(TokenKind::pu_equal))
+ return std::nullopt;
+
+ auto Flags = parseStaticSamplerFlags(TokenKind::kw_flags);
+ if (!Flags.has_value())
+ return std::nullopt;
+ Params.Flags = Flags;
} else {
consumeNextToken(); // let diagnostic be at the start of invalid token
reportDiag(diag::err_hlsl_invalid_token)
@@ -1255,6 +1272,50 @@ RootSignatureParser::parseDescriptorRangeFlags(TokenKind Context) {
return Flags;
}
+std::optional<llvm::dxbc::StaticSamplerFlags>
+RootSignatureParser::parseStaticSamplerFlags(TokenKind Context) {
+ assert(CurToken.TokKind == TokenKind::pu_equal &&
+ "Expects to only be invoked starting at given keyword");
+
+ // Handle the edge-case of '0' to specify no flags set
+ if (tryConsumeExpectedToken(TokenKind::int_literal)) {
+ if (!verifyZeroFlag()) {
+ reportDiag(diag::err_hlsl_rootsig_non_zero_flag);
+ return std::nullopt;
+ }
+ return llvm::dxbc::StaticSamplerFlags::None;
+ }
+
+ TokenKind Expected[] = {
+#define STATIC_SAMPLER_FLAG_ENUM(NAME, LIT) TokenKind::en_##NAME,
+#include "clang/Lex/HLSLRootSignatureTokenKinds.def"
+ };
+
+ std::optional<llvm::dxbc::StaticSamplerFlags> Flags;
+
+ do {
+ if (tryConsumeExpectedToken(Expected)) {
+ switch (CurToken.TokKind) {
+#define STATIC_SAMPLER_FLAG_ENUM(NAME, LIT) \
+ case TokenKind::en_##NAME: \
+ Flags = maybeOrFlag<llvm::dxbc::StaticSamplerFlags>( \
+ Flags, llvm::dxbc::StaticSamplerFlags::NAME); \
+ break;
+#include "clang/Lex/HLSLRootSignatureTokenKinds.def"
+ default:
+ llvm_unreachable("Switch for consumed enum token was not provided");
+ }
+ } else {
+ consumeNextToken(); // consume token to point at invalid token
+ reportDiag(diag::err_hlsl_invalid_token)
+ << /*value=*/1 << /*value of*/ Context;
+ return std::nullopt;
+ }
+ } while (tryConsumeExpectedToken(TokenKind::pu_or));
+
+ return Flags;
+}
+
std::optional<uint32_t> RootSignatureParser::handleUIntLiteral() {
// Parse the numeric value and do semantic checks on its specification
clang::NumericLiteralParser Literal(
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 7b37e0b..8b9e132 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -15949,7 +15949,7 @@ void Sema::RefersToMemberWithReducedAlignment(
}
// Check if the synthesized offset fulfills the alignment.
- if (Offset % ExpectedAlignment != 0 ||
+ if (!Offset.isMultipleOf(ExpectedAlignment) ||
// It may fulfill the offset it but the effective alignment may still be
// lower than the expected expression alignment.
CompleteObjectAlignment < ExpectedAlignment) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
index e1f9a77..955b8d1 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
@@ -385,6 +385,10 @@ public:
if (RTC.isUnretained(RetValue->getType()))
return;
}
+ if (retainsRet && *retainsRet) {
+ CreateOrCopyFnCall.insert(RetValue);
+ return;
+ }
if (auto *CE = dyn_cast<CallExpr>(RetValue)) {
auto *Callee = CE->getDirectCallee();
if (!Callee || !isCreateOrCopyFunction(Callee))
diff --git a/clang/lib/StaticAnalyzer/Core/Store.cpp b/clang/lib/StaticAnalyzer/Core/Store.cpp
index 971e6bc..b609f36 100644
--- a/clang/lib/StaticAnalyzer/Core/Store.cpp
+++ b/clang/lib/StaticAnalyzer/Core/Store.cpp
@@ -210,7 +210,7 @@ std::optional<const MemRegion *> StoreManager::castRegion(const MemRegion *R,
// Is the offset a multiple of the size? If so, we can layer the
// ElementRegion (with elementType == PointeeTy) directly on top of
// the base region.
- if (off % pointeeTySize == 0) {
+ if (off.isMultipleOf(pointeeTySize)) {
newIndex = off / pointeeTySize;
newSuperR = baseR;
}
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp
index d370bfd..66cf2688 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp
@@ -31,7 +31,7 @@ public:
for (const auto &File : getDependencies()) {
CanonPath = File;
llvm::sys::path::remove_dots(CanonPath, /*remove_dot_dot=*/true);
- llvm::sys::fs::make_absolute(WorkingDirectory, CanonPath);
+ llvm::sys::path::make_absolute(WorkingDirectory, CanonPath);
C.handleFileDependency(CanonPath);
}
}
diff --git a/clang/test/AST/HLSL/RootSignature-Target-AST.hlsl b/clang/test/AST/HLSL/RootSignature-Target-AST.hlsl
index 91441e3..129ab70 100644
--- a/clang/test/AST/HLSL/RootSignature-Target-AST.hlsl
+++ b/clang/test/AST/HLSL/RootSignature-Target-AST.hlsl
@@ -1,9 +1,15 @@
// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-rootsignature -ast-dump \
+// RUN: -fdx-rootsignature-version=rootsig_1_0 \
+// RUN: -hlsl-entry EntryRootSig -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-V1_0
+
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-rootsignature -ast-dump \
+// RUN: -fdx-rootsignature-version=rootsig_1_1 \
// RUN: -hlsl-entry EntryRootSig -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-V1_1
// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-rootsignature -ast-dump \
-// RUN: -fdx-rootsignature-version=rootsig_1_0 \
-// RUN: -hlsl-entry EntryRootSig -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-V1_0
+// RUN: -fdx-rootsignature-version=rootsig_1_2 \
+// RUN: -hlsl-entry EntryRootSig -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-V1_2
+
// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-rootsignature -ast-dump \
// RUN: -D CmdRS='"UAV(u0)"'\
@@ -12,11 +18,13 @@
// CHECK: -HLSLRootSignatureDecl 0x{{.*}} {{.*}} implicit [[ENTRY_RS_DECL:__hlsl_rootsig_decl_\d*]]
// CHECK-V1_0-SAME: version: 1.0,
// CHECK-V1_1-SAME: version: 1.1,
+// CHECK-V1_2-SAME: version: 1.2,
// CHECK-SAME: RootElements{
// CHECK-SAME: RootCBV(b0,
// CHECK-SAME: space = 0, visibility = All,
// CHECK-V1_0-SAME: flags = DataVolatile
// CHECK-V1_1-SAME: flags = DataStaticWhileSetAtExecute
+// CHECK-V1_2-SAME: flags = DataStaticWhileSetAtExecute
// CHECK-SAME: )
// CHECK-SAME: }
#define EntryRootSig "CBV(b0)"
diff --git a/clang/test/AST/HLSL/RootSignatures-AST.hlsl b/clang/test/AST/HLSL/RootSignatures-AST.hlsl
index 32da1f1..0f0f3a5 100644
--- a/clang/test/AST/HLSL/RootSignatures-AST.hlsl
+++ b/clang/test/AST/HLSL/RootSignatures-AST.hlsl
@@ -6,6 +6,9 @@
// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -ast-dump \
// RUN: -fdx-rootsignature-version=rootsig_1_1 \
// RUN: -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-V1_1
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -ast-dump \
+// RUN: -fdx-rootsignature-version=rootsig_1_2 \
+// RUN: -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-V1_2
// This test ensures that the sample root signature is parsed without error and
// the Attr AST Node is created succesfully. If an invalid root signature was
@@ -31,6 +34,7 @@
// CHECK: -HLSLRootSignatureDecl 0x{{.*}} {{.*}} implicit [[SAMPLE_RS_DECL:__hlsl_rootsig_decl_\d*]]
// CHECK-V1_0: version: 1.0,
// CHECK-V1_1: version: 1.1,
+// CHECK-V1_2: version: 1.2,
// CHECK-SAME: RootElements{
// CHECK-SAME: RootFlags(AllowInputAssemblerInputLayout | DenyVertexShaderRootAccess),
// CHECK-SAME: RootCBV(b0,
@@ -62,6 +66,7 @@
// CHECK-SAME: s0, numDescriptors = 4, space = 1, offset = DescriptorTableOffsetAppend,
// CHECK-V1_0-SAME: flags = DescriptorsVolatile
// CHECK-V1_1-SAME: flags = None
+// CHECK-V1_2-SAME: flags = None
// CHECK-SAME: ),
// CHECK-SAME: DescriptorTable(
// CHECK-SAME: numClauses = 1, visibility = All
@@ -73,6 +78,7 @@
// CHECK-SAME: s1, filter = Anisotropic, addressU = Wrap, addressV = Wrap, addressW = Wrap,
// CHECK-SAME: mipLODBias = 0.000000e+00, maxAnisotropy = 16, comparisonFunc = LessEqual,
// CHECK-SAME: borderColor = OpaqueWhite, minLOD = 0.000000e+00, maxLOD = 3.402823e+38, space = 0, visibility = All
+// CHECK-SAME: flags = None
// CHECK-SAME: )}
// CHECK: -RootSignatureAttr 0x{{.*}} {{.*}} [[SAMPLE_RS_DECL]]
@@ -131,3 +137,24 @@ void same_rs_string_main() {}
// CHECK: -RootSignatureAttr 0x{{.*}} {{.*}} [[DIFF_RS_DECL]]
[RootSignature(SampleDifferentRS)]
void different_rs_string_main() {}
+
+#define SampleStaticSamplerRS \
+ "StaticSampler(s0, flags = NON_NORMALIZED_COORDINATES)"
+
+// Ensure that static sampler flags are correctly parsed in different versions
+
+// CHECK: -HLSLRootSignatureDecl 0x{{.*}} {{.*}} implicit [[DIFF_RS_DECL:__hlsl_rootsig_decl_\d*]]
+// CHECK-V1_0: version: 1.0,
+// CHECK-V1_1: version: 1.1,
+// CHECK-V1_2: version: 1.2,
+// CHECK-SAME: RootElements{
+// CHECK-SAME: StaticSampler(
+// CHECK-SAME: s0, filter = Anisotropic, addressU = Wrap, addressV = Wrap, addressW = Wrap,
+// CHECK-SAME: mipLODBias = 0.000000e+00, maxAnisotropy = 16, comparisonFunc = LessEqual,
+// CHECK-SAME: borderColor = OpaqueWhite, minLOD = 0.000000e+00, maxLOD = 3.402823e+38, space = 0, visibility = All
+// CHECK-SAME: flags = NonNormalizedCoordinates
+// CHECK-SAME: )}
+
+// CHECK: -RootSignatureAttr 0x{{.*}} {{.*}} [[DIFF_RS_DECL]]
+[RootSignature(SampleStaticSamplerRS)]
+void statoc_sampler_v12_main() {}
diff --git a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h
index 39dee17..dacb713 100644
--- a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h
@@ -17,6 +17,20 @@ template<typename T> typename remove_reference<T>::type&& move(T&& t);
#endif
+namespace std {
+
+template <bool, typename U = void> struct enable_if {
+};
+
+template <typename T> struct enable_if<true, T> {
+ using type = T;
+};
+
+template <bool value, class T = void>
+using enable_if_t = typename enable_if<value, T>::type;
+
+}
+
@class NSString;
@class NSArray;
@class NSMutableArray;
@@ -100,6 +114,7 @@ id CFBridgingRelease(CFTypeRef X) {
__attribute__((objc_root_class))
@interface NSObject
+ (instancetype) alloc;
++ (instancetype) allocWithZone:(NSZone *)zone;
+ (Class) class;
+ (Class) superclass;
- (instancetype) init;
@@ -232,6 +247,14 @@ template <typename T> struct RemovePointer<T*> {
typedef T Type;
};
+template <typename T> struct IsPointer {
+ static constexpr bool value = false;
+};
+
+template <typename T> struct IsPointer<T*> {
+ static constexpr bool value = true;
+};
+
template <typename T> struct RetainPtr {
using ValueType = typename RemovePointer<T>::Type;
using PtrType = ValueType*;
@@ -285,12 +308,23 @@ template <typename T> struct RetainPtr {
PtrType operator->() const { return t; }
T &operator*() const { return *t; }
RetainPtr &operator=(PtrType t);
- PtrType leakRef()
+
+ template <typename U = PtrType>
+ std::enable_if_t<IsPointer<U>::value, U> leakRef() CF_RETURNS_RETAINED
+ {
+ PtrType s = t;
+ t = nullptr;
+ return s;
+ }
+
+ template <typename U = PtrType>
+ std::enable_if_t<!IsPointer<U>::value, U> leakRef() NS_RETURNS_RETAINED
{
PtrType s = t;
t = nullptr;
return s;
}
+
operator PtrType() const { return t; }
operator bool() const { return t; }
diff --git a/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm b/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm
index 7699017..4570561 100644
--- a/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm
+++ b/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm
@@ -104,6 +104,14 @@ void basic_correct_arc() {
_number = value;
}
+- (id)copyWithZone:(NSZone *)zone {
+ auto copy = adoptNS([(SomeObj *)[SomeObj allocWithZone:zone] init]);
+ [copy setValue:_number];
+ [copy setNext:_next];
+ [copy setOther:_other];
+ return copy.leakRef();
+}
+
@end;
RetainPtr<CVPixelBufferRef> cf_out_argument() {
@@ -151,7 +159,7 @@ NSArray *makeArray() NS_RETURNS_RETAINED {
extern Class (*getNSArrayClass)();
NSArray *allocArrayInstance() NS_RETURNS_RETAINED {
- return [[getNSArrayClass() alloc] init];
+ return adoptNS([[getNSArrayClass() alloc] init]).leakRef();
}
extern int (*GetObj)(CF_RETURNS_RETAINED CFTypeRef* objOut);
@@ -294,7 +302,7 @@ RetainPtr<CFArrayRef> adopt_make_array() {
}
-(NSString *)make_string {
- return [[NSString alloc] initWithUTF8String:"hello"];
+ return adoptNS([[NSString alloc] initWithUTF8String:"hello"]).leakRef();
}
-(void)local_leak_string {
diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp
index 2d58c38..ae69b24 100644
--- a/clang/test/CIR/CodeGen/complex.cpp
+++ b/clang/test/CIR/CodeGen/complex.cpp
@@ -1140,7 +1140,8 @@ void real_on_scalar_glvalue() {
// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.float, !cir.ptr<!cir.float>, ["a"]
// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.float, !cir.ptr<!cir.float>, ["b", init]
// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.float>, !cir.float
-// CIR: cir.store{{.*}} %[[TMP_A]], %[[B_ADDR]] : !cir.float, !cir.ptr<!cir.float>
+// CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.float -> !cir.float
+// CIR: cir.store{{.*}} %[[A_REAL]], %[[B_ADDR]] : !cir.float, !cir.ptr<!cir.float>
// LLVM: %[[A_ADDR:.*]] = alloca float, i64 1, align 4
// LLVM: %[[B_ADDR:.*]] = alloca float, i64 1, align 4
@@ -1179,7 +1180,8 @@ void real_on_scalar_with_type_promotion() {
// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr<!cir.f16>, ["b", init]
// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.f16>, !cir.f16
// CIR: %[[TMP_A_F32:.*]] = cir.cast floating %[[TMP_A]] : !cir.f16 -> !cir.float
-// CIR: %[[TMP_A_F16:.*]] = cir.cast floating %[[TMP_A_F32]] : !cir.float -> !cir.f16
+// CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A_F32]] : !cir.float -> !cir.float
+// CIR: %[[TMP_A_F16:.*]] = cir.cast floating %[[A_REAL]] : !cir.float -> !cir.f16
// CIR: cir.store{{.*}} %[[TMP_A_F16]], %[[B_ADDR]] : !cir.f16, !cir.ptr<!cir.f16>
// LLVM: %[[A_ADDR:.*]] = alloca half, i64 1, align 2
@@ -1248,7 +1250,8 @@ void real_on_scalar_from_real_with_type_promotion() {
// CIR: %[[A_IMAG_F32:.*]] = cir.cast floating %[[A_IMAG]] : !cir.f16 -> !cir.float
// CIR: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex<!cir.float>
// CIR: %[[A_REAL_F32:.*]] = cir.complex.real %[[A_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
-// CIR: %[[A_REAL_F16:.*]] = cir.cast floating %[[A_REAL_F32]] : !cir.float -> !cir.f16
+// CIR: %[[A_REAL:.*]] = cir.complex.real %[[A_REAL_F32]] : !cir.float -> !cir.float
+// CIR: %[[A_REAL_F16:.*]] = cir.cast floating %[[A_REAL]] : !cir.float -> !cir.f16
// CIR: cir.store{{.*}} %[[A_REAL_F16]], %[[B_ADDR]] : !cir.f16, !cir.ptr<!cir.f16>
// LLVM: %[[A_ADDR:.*]] = alloca { half, half }, i64 1, align 2
@@ -1285,8 +1288,9 @@ void real_on_scalar_from_imag_with_type_promotion() {
// CIR: %[[A_IMAG_F32:.*]] = cir.cast floating %[[A_IMAG]] : !cir.f16 -> !cir.float
// CIR: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex<!cir.float>
// CIR: %[[A_IMAG_F32:.*]] = cir.complex.imag %[[A_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
-// CIR: %[[A_IMAG_F16:.*]] = cir.cast floating %[[A_IMAG_F32]] : !cir.float -> !cir.f16
-// CIR: cir.store{{.*}} %[[A_IMAG_F16]], %[[B_ADDR]] : !cir.f16, !cir.ptr<!cir.f16>
+// CIR: %[[A_REAL_F32:.*]] = cir.complex.real %[[A_IMAG_F32]] : !cir.float -> !cir.float
+// CIR: %[[A_REAL_F16:.*]] = cir.cast floating %[[A_REAL_F32]] : !cir.float -> !cir.f16
+// CIR: cir.store{{.*}} %[[A_REAL_F16]], %[[B_ADDR]] : !cir.f16, !cir.ptr<!cir.f16>
// LLVM: %[[A_ADDR:.*]] = alloca { half, half }, i64 1, align 2
// LLVM: %[[B_ADDR]] = alloca half, i64 1, align 2
diff --git a/clang/test/CodeGenHLSL/RootSignature.hlsl b/clang/test/CodeGenHLSL/RootSignature.hlsl
index bc40bdd..eaff3a9 100644
--- a/clang/test/CodeGenHLSL/RootSignature.hlsl
+++ b/clang/test/CodeGenHLSL/RootSignature.hlsl
@@ -82,8 +82,8 @@ void RootDescriptorsEntry() {}
// checking minLOD, maxLOD
// CHECK-SAME: float -1.280000e+02, float 1.280000e+02,
-// checking register, space and visibility
-// CHECK-SAME: i32 42, i32 0, i32 0}
+// checking register, space, visibility and flag
+// CHECK-SAME: i32 42, i32 0, i32 0, i32 1}
#define SampleStaticSampler \
"StaticSampler(s42, " \
@@ -96,6 +96,7 @@ void RootDescriptorsEntry() {}
" borderColor = STATIC_BORDER_COLOR_OPAQUE_WHITE, " \
" minLOD = -128.f, maxLOD = 128.f, " \
" space = 0, visibility = SHADER_VISIBILITY_ALL, " \
+ " flags = UINT_BORDER_COLOR" \
")"
[shader("compute"), RootSignature(SampleStaticSampler)]
[numthreads(1,1,1)]
diff --git a/clang/test/OpenMP/amdgcn_save_temps.c b/clang/test/OpenMP/amdgcn_save_temps.c
index ebf0d60..d838bb1 100644
--- a/clang/test/OpenMP/amdgcn_save_temps.c
+++ b/clang/test/OpenMP/amdgcn_save_temps.c
@@ -1,8 +1,6 @@
// REQUIRES: amdgpu-registered-target
-// XFAIL: *
-
// RUN: %clang_cc1 -E -fopenmp -x c -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -save-temps=cwd %s -o %t-openmp-amdgcn-amd-amdhsa-gfx90a.i
// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -save-temps=cwd -emit-llvm-bc %s -o %t-x86_64-unknown-unknown.bc
// RUN: %clang_cc1 -fopenmp -x c -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -save-temps=cwd -emit-llvm -fopenmp-is-target-device -x cpp-output %t-openmp-amdgcn-amd-amdhsa-gfx90a.i -fopenmp-host-ir-file-path %t-x86_64-unknown-unknown.bc -o - | FileCheck %s
diff --git a/clang/test/SemaHLSL/RootSignature-err.hlsl b/clang/test/SemaHLSL/RootSignature-err.hlsl
index 89c684c..debeafe 100644
--- a/clang/test/SemaHLSL/RootSignature-err.hlsl
+++ b/clang/test/SemaHLSL/RootSignature-err.hlsl
@@ -191,6 +191,10 @@ void basic_validation_5() {}
[RootSignature("StaticSampler(s0, mipLODBias = 15.990001)")]
void basic_validation_6() {}
+// expected-error@+1 {{invalid value of flags}}
+[RootSignature("StaticSampler(s0, flags = FLAG_TYPO)")]
+void basic_validation_7() {}
+
// expected-error@+1 {{sampler and non-sampler resource mixed in descriptor table}}
[RootSignature("DescriptorTable(Sampler(s0), CBV(b0))")]
void mixed_resource_table() {}
diff --git a/clang/test/SemaHLSL/RootSignature-flags-err.hlsl b/clang/test/SemaHLSL/RootSignature-flags-err.hlsl
index 9449d33..c79e692 100644
--- a/clang/test/SemaHLSL/RootSignature-flags-err.hlsl
+++ b/clang/test/SemaHLSL/RootSignature-flags-err.hlsl
@@ -2,7 +2,8 @@
// RUN: -fdx-rootsignature-version=rootsig_1_0 %s -verify=v10
// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -fsyntax-only \
// RUN: -fdx-rootsignature-version=rootsig_1_1 %s -verify=v11
-
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -fsyntax-only \
+// RUN: -fdx-rootsignature-version=rootsig_1_2 %s -verify=v12
// Root Descriptor Flags:
// v10-error@+1 {{invalid flags for version 1.0}}
@@ -13,8 +14,9 @@ void bad_root_descriptor_flags_0() {}
[RootSignature("CBV(b0, flags = DATA_STATIC_WHILE_SET_AT_EXECUTE)")]
void bad_root_descriptor_flags_1() {}
-// v10-error@+2 {{invalid flags for version 1.0}}
-// v11-error@+1 {{invalid flags for version 1.1}}
+// v10-error@+3 {{invalid flags for version 1.0}}
+// v11-error@+2 {{invalid flags for version 1.1}}
+// v12-error@+1 {{invalid flags for version 1.2}}
[RootSignature("CBV(b0, flags = DATA_STATIC | DATA_VOLATILE)")]
void bad_root_descriptor_flags_2() {}
@@ -40,18 +42,20 @@ void bad_descriptor_range_flags_3() {}
[RootSignature("DescriptorTable(CBV(b0, flags = DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS))")]
void bad_descriptor_range_flags_4() {}
-// v10-error@+2 {{invalid flags for version 1.0}}
-// v11-error@+1 {{invalid flags for version 1.1}}
+// v10-error@+3 {{invalid flags for version 1.0}}
+// v11-error@+2 {{invalid flags for version 1.1}}
+// v12-error@+1 {{invalid flags for version 1.2}}
[RootSignature("DescriptorTable(CBV(b0, flags = DATA_STATIC | DATA_STATIC_WHILE_SET_AT_EXECUTE))")]
void bad_descriptor_range_flags_5() {}
-// v10-error@+2 {{invalid flags for version 1.0}}
-// v11-error@+1 {{invalid flags for version 1.1}}
+// v10-error@+3 {{invalid flags for version 1.0}}
+// v11-error@+2 {{invalid flags for version 1.1}}
+// v12-error@+1 {{invalid flags for version 1.2}}
[RootSignature("DescriptorTable(CBV(b0, flags = DESCRIPTORS_VOLATILE | DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS))")]
void bad_descriptor_range_flags_6() {}
-// v10-error@+2 {{invalid flags for version 1.0}}
-// v11-error@+1 {{invalid flags for version 1.1}}
+// v10-error@+3 {{invalid flags for version 1.0}}
+// v11-error@+2 {{invalid flags for version 1.1}}
+// v12-error@+1 {{invalid flags for version 1.2}}
[RootSignature("DescriptorTable(CBV(b0, flags = DESCRIPTORS_VOLATILE | DATA_STATIC))")]
void bad_descriptor_range_flags_7() {}
-
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index be658aca..1419b8c 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -608,10 +608,10 @@ Expected<StringRef> linkDevice(ArrayRef<StringRef> InputFiles,
Error containerizeRawImage(std::unique_ptr<MemoryBuffer> &Img, OffloadKind Kind,
const ArgList &Args) {
llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
- if (Kind != OFK_OpenMP || !Triple.isSPIRV() ||
- Triple.getVendor() != llvm::Triple::Intel)
- return Error::success();
- return offloading::intel::containerizeOpenMPSPIRVImage(Img);
+ if (Kind == OFK_OpenMP && Triple.isSPIRV() &&
+ Triple.getVendor() == llvm::Triple::Intel)
+ return offloading::intel::containerizeOpenMPSPIRVImage(Img);
+ return Error::success();
}
Expected<StringRef> writeOffloadFile(const OffloadFile &File) {
diff --git a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp
index 8dd993f..594c79a 100644
--- a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp
+++ b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp
@@ -27,22 +27,16 @@
#include "llvm/LTO/LTO.h"
#include "llvm/Linker/Linker.h"
#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Object/Archive.h"
-#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Object/Binary.h"
-#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/IRObjectFile.h"
-#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/OffloadBinary.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/OptTable.h"
#include "llvm/Option/Option.h"
-#include "llvm/Remarks/HotnessThresholdParser.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/InitLLVM.h"
-#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/Signals.h"
diff --git a/clang/unittests/Frontend/CompilerInstanceTest.cpp b/clang/unittests/Frontend/CompilerInstanceTest.cpp
index 36cac5a..cd3fefa 100644
--- a/clang/unittests/Frontend/CompilerInstanceTest.cpp
+++ b/clang/unittests/Frontend/CompilerInstanceTest.cpp
@@ -33,7 +33,7 @@ TEST(CompilerInstance, DefaultVFSOverlayFromInvocation) {
SmallString<256> CurrentPath;
sys::fs::current_path(CurrentPath);
- sys::fs::make_absolute(CurrentPath, FileName);
+ sys::path::make_absolute(CurrentPath, FileName);
// Mount the VFS file itself on the path 'virtual.file'. Makes this test
// a bit shorter than creating a new dummy file just for this purpose.
diff --git a/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp b/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp
index 01f8d4f..82f1968 100644
--- a/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp
+++ b/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp
@@ -226,6 +226,9 @@ TEST_F(LexHLSLRootSignatureTest, ValidLexAllTokensTest) {
STATIC_BORDER_COLOR_OPAQUE_WHITE
STATIC_BORDER_COLOR_OPAQUE_BLACK_UINT
STATIC_BORDER_COLOR_OPAQUE_WHITE_UINT
+
+ UINT_BORDER_COLOR
+ NON_NORMALIZED_COORDINATES
)cc";
hlsl::RootSignatureLexer Lexer(Source);
diff --git a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp
index 9b9f5dd..f7e9d2d 100644
--- a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp
+++ b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp
@@ -263,7 +263,8 @@ TEST_F(ParseHLSLRootSignatureTest, ValidParseStaticSamplerTest) {
filter = FILTER_MAXIMUM_MIN_POINT_MAG_LINEAR_MIP_POINT,
maxLOD = 9000, addressU = TEXTURE_ADDRESS_MIRROR,
comparisonFunc = COMPARISON_NOT_EQUAL,
- borderColor = STATIC_BORDER_COLOR_OPAQUE_BLACK_UINT
+ borderColor = STATIC_BORDER_COLOR_OPAQUE_BLACK_UINT,
+ flags = 0
)
)cc";
@@ -336,6 +337,37 @@ TEST_F(ParseHLSLRootSignatureTest, ValidParseStaticSamplerTest) {
ASSERT_TRUE(Consumer->isSatisfied());
}
+TEST_F(ParseHLSLRootSignatureTest, ValidStaticSamplerFlagsTest) {
+ const llvm::StringLiteral Source = R"cc(
+ StaticSampler(s0, flags = UINT_BORDER_COLOR | NON_NORMALIZED_COORDINATES)
+ )cc";
+
+ auto Ctx = createMinimalASTContext();
+ StringLiteral *Signature = wrapSource(Ctx, Source);
+
+ TrivialModuleLoader ModLoader;
+ auto PP = createPP(Source, ModLoader);
+
+ hlsl::RootSignatureParser Parser(RootSignatureVersion::V1_1, Signature, *PP);
+
+ // Test no diagnostics produced
+ Consumer->setNoDiag();
+
+ ASSERT_FALSE(Parser.parse());
+
+ auto Elements = Parser.getElements();
+ ASSERT_EQ(Elements.size(), 1u);
+
+ RootElement Elem = Elements[0].getElement();
+ ASSERT_TRUE(std::holds_alternative<StaticSampler>(Elem));
+ auto ValidStaticSamplerFlags =
+ llvm::dxbc::StaticSamplerFlags::NonNormalizedCoordinates |
+ llvm::dxbc::StaticSamplerFlags::UintBorderColor;
+ ASSERT_EQ(std::get<StaticSampler>(Elem).Flags, ValidStaticSamplerFlags);
+
+ ASSERT_TRUE(Consumer->isSatisfied());
+}
+
TEST_F(ParseHLSLRootSignatureTest, ValidParseFloatsTest) {
const llvm::StringLiteral Source = R"cc(
StaticSampler(s0, mipLODBias = 0),
diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp
index f73b0aecc..74f29ac 100644
--- a/clang/utils/TableGen/RISCVVEmitter.cpp
+++ b/clang/utils/TableGen/RISCVVEmitter.cpp
@@ -133,28 +133,20 @@ static BasicType ParseBasicType(char c) {
switch (c) {
case 'c':
return BasicType::Int8;
- break;
case 's':
return BasicType::Int16;
- break;
case 'i':
return BasicType::Int32;
- break;
case 'l':
return BasicType::Int64;
- break;
case 'x':
return BasicType::Float16;
- break;
case 'f':
return BasicType::Float32;
- break;
case 'd':
return BasicType::Float64;
- break;
case 'y':
return BasicType::BFloat16;
- break;
default:
return BasicType::Unknown;
}
diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt
index 1d7bb78..2cd4c4c 100644
--- a/clang/utils/perf-training/CMakeLists.txt
+++ b/clang/utils/perf-training/CMakeLists.txt
@@ -6,6 +6,10 @@ set(CLANG_PGO_TRAINING_DATA "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH
set(CLANG_PGO_TRAINING_DATA_SOURCE_DIR OFF CACHE STRING "Path to source directory containing cmake project with source files to use for generating pgo data")
set(CLANG_PGO_TRAINING_DEPS "" CACHE STRING "Extra dependencies needed to build the PGO training data.")
+add_custom_target(clear-perf-data
+ COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data
+ COMMENT "Clearing old perf data")
+
option(CLANG_PGO_TRAINING_USE_LLVM_BUILD "Use LLVM build for generating PGO data" ON)
llvm_canonicalize_cmake_booleans(
@@ -21,7 +25,7 @@ if(LLVM_BUILD_INSTRUMENTED)
add_lit_testsuite(generate-profraw "Generating clang PGO data"
${CMAKE_CURRENT_BINARY_DIR}/pgo-data/
EXCLUDE_FROM_CHECK_ALL
- DEPENDS clear-profraw
+ DEPENDS clear-profraw clang
)
add_custom_target(clear-profraw
@@ -55,6 +59,32 @@ if(LLVM_BUILD_INSTRUMENTED)
USE_TOOLCHAIN EXLUDE_FROM_ALL NO_INSTALL DEPENDS generate-profraw)
add_dependencies(generate-profdata generate-profraw-external)
endif()
+
+ if(NOT LLVM_PROFGEN)
+ find_program(LLVM_PROFGEN llvm-profgen)
+ endif()
+
+ if(NOT LLVM_PROFGEN)
+ message(STATUS "To enable converting CSSPGO samples LLVM_PROFGEN has to point to llvm-profgen")
+ elseif(NOT CLANG_PGO_TRAINING_DATA_SOURCE_DIR)
+ message(STATUS "CLANG_PGO_TRAINING_DATA_SOURCE_DIR must be set to collect CSSPGO samples")
+ else()
+ set(PERF_HELPER "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py)
+ set(CLANG_SPROFDATA ${CMAKE_CURRENT_BINARY_DIR}/clang.sprofdata)
+ add_custom_command(
+ OUTPUT ${CLANG_SPROFDATA}
+ # Execute generate-profraw-external under perf
+ COMMAND ${PERF_HELPER} perf --csspgo -- ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target generate-profraw-external
+ # Convert perf profile into profraw
+ COMMAND ${PERF_HELPER} perf2prof ${LLVM_PROFGEN} $<TARGET_FILE:clang> ${CMAKE_CURRENT_BINARY_DIR}
+ # Merge profdata
+ COMMAND ${PERF_HELPER} merge --sample ${LLVM_PROFDATA} ${CLANG_SPROFDATA} ${CMAKE_CURRENT_BINARY_DIR}
+ DEPENDS clang ${CLANG_PGO_TRAINING_DEPS} clear-perf-data generate-profraw-external-clean
+ VERBATIM
+ USES_TERMINAL
+ )
+ add_custom_target(generate-sprofdata DEPENDS ${CLANG_SPROFDATA})
+ endif()
endif()
endif()
@@ -104,8 +134,4 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} fdata
COMMENT "Clearing old BOLT fdata")
- add_custom_target(clear-perf-data
- COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data
- COMMENT "Clearing old perf data")
-
endif()
diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py
index ab4491d..1c7904e 100644
--- a/clang/utils/perf-training/perf-helper.py
+++ b/clang/utils/perf-training/perf-helper.py
@@ -45,14 +45,22 @@ def clean(args):
def merge(args):
- if len(args) < 3:
- print(
- "Usage: %s merge <llvm-profdata> <output> <paths>\n" % __file__
- + "\tMerges all profraw files from path into output."
- )
- return 1
- cmd = [args[0], "merge", "-o", args[1]]
- for path in args[2:]:
+ parser = argparse.ArgumentParser(
+ prog="perf-helper merge",
+ description="Merges all profraw files from path(s) into output",
+ )
+ parser.add_argument("profdata", help="Path to llvm-profdata tool")
+ parser.add_argument("output", help="Output filename")
+ parser.add_argument(
+ "paths", nargs="+", help="Folder(s) containing input profraw files"
+ )
+ parser.add_argument("--sample", action="store_true", help="Sample profile")
+ opts = parser.parse_args(args)
+
+ cmd = [opts.profdata, "merge", "-o", opts.output]
+ if opts.sample:
+ cmd += ["--sample"]
+ for path in opts.paths:
cmd.extend(findFilesWithExtension(path, "profraw"))
subprocess.check_call(cmd)
return 0
@@ -73,25 +81,30 @@ def merge_fdata(args):
def perf(args):
parser = argparse.ArgumentParser(
- prog="perf-helper perf", description="perf wrapper for BOLT profile collection"
+ prog="perf-helper perf",
+ description="perf wrapper for BOLT/CSSPGO profile collection",
)
parser.add_argument(
"--lbr", action="store_true", help="Use perf with branch stacks"
)
+ parser.add_argument("--csspgo", action="store_true", help="Enable CSSPGO flags")
parser.add_argument("cmd", nargs=argparse.REMAINDER, help="")
opts = parser.parse_args(args)
cmd = opts.cmd[1:]
+ event = "br_inst_retired.near_taken:uppp" if opts.csspgo else "cycles:u"
perf_args = [
"perf",
"record",
- "--event=cycles:u",
+ f"--event={event}",
"--freq=max",
"--output=%d.perf.data" % os.getpid(),
]
- if opts.lbr:
+ if opts.lbr or opts.csspgo:
perf_args += ["--branch-filter=any,u"]
+ if opts.csspgo:
+ perf_args += ["-g", "--call-graph=fp"]
perf_args.extend(cmd)
start_time = time.time()
@@ -127,6 +140,30 @@ def perf2bolt(args):
return 0
+def perf2prof(args):
+ parser = argparse.ArgumentParser(
+ prog="perf-helper perf2prof",
+ description="perf to CSSPGO prof conversion wrapper",
+ )
+ parser.add_argument("profgen", help="Path to llvm-profgen binary")
+ parser.add_argument("binary", help="Input binary")
+ parser.add_argument("paths", nargs="+", help="Path containing perf.data files")
+ opts = parser.parse_args(args)
+
+ profgen_args = [opts.profgen, f"--binary={opts.binary}"]
+ for path in opts.paths:
+ for filename in findFilesWithExtension(path, "perf.data"):
+ subprocess.run(
+ [
+ *profgen_args,
+ f"--perfdata={filename}",
+ f"--output={filename}.profraw",
+ ],
+ check=True,
+ )
+ return 0
+
+
def dtrace(args):
parser = argparse.ArgumentParser(
prog="perf-helper dtrace",
@@ -660,7 +697,10 @@ def bolt_optimize(args):
process.check_returncode()
if opts.method in ["PERF", "LBR"]:
- perf2bolt([opts.bolt, opts.perf_training_binary_dir, opts.input])
+ args = [opts.bolt, opts.perf_training_binary_dir, opts.input]
+ if opts.method == "LBR":
+ args.append("--lbr")  # append, not extend: extend() would add each character separately
+ perf2bolt(args)
merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir])
@@ -707,6 +747,7 @@ commands = {
"merge-fdata": merge_fdata,
"perf": perf,
"perf2bolt": perf2bolt,
+ "perf2prof": perf2prof,
}
diff --git a/flang-rt/lib/runtime/derived-api.cpp b/flang-rt/lib/runtime/derived-api.cpp
index bb08e03..fe68682 100644
--- a/flang-rt/lib/runtime/derived-api.cpp
+++ b/flang-rt/lib/runtime/derived-api.cpp
@@ -118,14 +118,26 @@ bool RTDEF(SameTypeAs)(const Descriptor &a, const Descriptor &b) {
}
bool RTDEF(ExtendsTypeOf)(const Descriptor &a, const Descriptor &mold) {
+ // The wording of the standard indicates null or unallocated checks take
+ // precedence over the extension checks which take precedence over any
+ // compiler specific behavior.
+ // F'23 16.9.86 p 5
+ // If MOLD is unlimited polymorphic and is either a disassociated pointer or
+ // unallocated allocatable variable, the result is true;
auto aType{a.raw().type};
auto moldType{mold.raw().type};
if ((aType != CFI_type_struct && aType != CFI_type_other) ||
(moldType != CFI_type_struct && moldType != CFI_type_other)) {
- // If either type is intrinsic, they must match.
- return aType == moldType;
- } else if (const typeInfo::DerivedType *
- derivedTypeMold{GetDerivedType(mold)}) {
+ if (!mold.IsAllocated()) {
+ return true;
+ } else if (!a.IsAllocated()) {
+ return false;
+ } else {
+ // If either type is intrinsic and not a pointer or allocatable
+ // then they must match.
+ return aType == moldType;
+ }
+ } else if (const auto *derivedTypeMold{GetDerivedType(mold)}) {
// If A is unlimited polymorphic and is either a disassociated pointer or
// unallocated allocatable, the result is false.
// Otherwise if the dynamic type of A or MOLD is extensible, the result is
diff --git a/flang/include/flang/Semantics/openmp-utils.h b/flang/include/flang/Semantics/openmp-utils.h
index 08b6716..2954a1c 100644
--- a/flang/include/flang/Semantics/openmp-utils.h
+++ b/flang/include/flang/Semantics/openmp-utils.h
@@ -37,6 +37,8 @@ template <typename T, typename U = std::remove_const_t<T>> U AsRvalue(T &t) {
template <typename T> T &&AsRvalue(T &&t) { return std::move(t); }
+const Scope &GetScopingUnit(const Scope &scope);
+
// There is no consistent way to get the source of an ActionStmt, but there
// is "source" in Statement<T>. This structure keeps the ActionStmt with the
// extracted source for further use.
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index e224e06..1f059f747 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1361,9 +1361,19 @@ void OmpStructureChecker::Enter(const parser::OpenMPDeclareSimdConstruct &x) {
return;
}
+ auto isValidSymbol{[](const Symbol *sym) {
+ if (IsProcedure(*sym) || IsFunction(*sym)) {
+ return true;
+ }
+ if (const Symbol *owner{GetScopingUnit(sym->owner()).symbol()}) {
+ return IsProcedure(*owner) || IsFunction(*owner);
+ }
+ return false;
+ }};
+
const parser::OmpArgument &arg{args.v.front()};
if (auto *sym{GetArgumentSymbol(arg)}) {
- if (!IsProcedure(*sym) && !IsFunction(*sym)) {
+ if (!isValidSymbol(sym)) {
auto &msg{context_.Say(arg.source,
"The name '%s' should refer to a procedure"_err_en_US, sym->name())};
if (sym->test(Symbol::Flag::Implicit)) {
diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp
index 35b7718..a8ec4d6 100644
--- a/flang/lib/Semantics/openmp-utils.cpp
+++ b/flang/lib/Semantics/openmp-utils.cpp
@@ -41,6 +41,24 @@
namespace Fortran::semantics::omp {
using namespace Fortran::parser::omp;
+const Scope &GetScopingUnit(const Scope &scope) {
+ const Scope *iter{&scope};
+ for (; !iter->IsTopLevel(); iter = &iter->parent()) {
+ switch (iter->kind()) {
+ case Scope::Kind::BlockConstruct:
+ case Scope::Kind::BlockData:
+ case Scope::Kind::DerivedType:
+ case Scope::Kind::MainProgram:
+ case Scope::Kind::Module:
+ case Scope::Kind::Subprogram:
+ return *iter;
+ default:
+ break;
+ }
+ }
+ return *iter;
+}
+
SourcedActionStmt GetActionStmt(const parser::ExecutionPartConstruct *x) {
if (x == nullptr) {
return SourcedActionStmt{};
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index bd7b8ac..b1eaaa8 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -379,24 +379,6 @@ public:
explicit OmpAttributeVisitor(SemanticsContext &context)
: DirectiveAttributeVisitor(context) {}
- static const Scope &scopingUnit(const Scope &scope) {
- const Scope *iter{&scope};
- for (; !iter->IsTopLevel(); iter = &iter->parent()) {
- switch (iter->kind()) {
- case Scope::Kind::BlockConstruct:
- case Scope::Kind::BlockData:
- case Scope::Kind::DerivedType:
- case Scope::Kind::MainProgram:
- case Scope::Kind::Module:
- case Scope::Kind::Subprogram:
- return *iter;
- default:
- break;
- }
- }
- return *iter;
- }
-
template <typename A> void Walk(const A &x) { parser::Walk(x, *this); }
template <typename A> bool Pre(const A &) { return true; }
template <typename A> void Post(const A &) {}
@@ -2303,14 +2285,17 @@ void OmpAttributeVisitor::CheckPerfectNestAndRectangularLoop(
}
auto checkPerfectNest = [&, this]() {
- auto blockSize = block.size();
- if (blockSize <= 1)
+ if (block.empty())
return;
+ auto last = block.end();
+ --last;
- if (parser::Unwrap<parser::ContinueStmt>(x))
- blockSize -= 1;
+ // A trailing CONTINUE is not considered part of the loop body
+ if (parser::Unwrap<parser::ContinueStmt>(*last))
+ --last;
- if (blockSize <= 1)
+ // In a perfectly nested loop, the nested loop must be the only statement
+ if (last == block.begin())
return;
// Non-perfectly nested loop
@@ -3086,8 +3071,8 @@ void OmpAttributeVisitor::ResolveOmpDesignator(
checkScope = ompFlag == Symbol::Flag::OmpExecutableAllocateDirective;
}
if (checkScope) {
- if (scopingUnit(GetContext().scope) !=
- scopingUnit(symbol->GetUltimate().owner())) {
+ if (omp::GetScopingUnit(GetContext().scope) !=
+ omp::GetScopingUnit(symbol->GetUltimate().owner())) {
context_.Say(designator.source, // 2.15.3
"List items must be declared in the same scoping unit in which the %s directive appears"_err_en_US,
parser::ToUpperCaseLetters(
diff --git a/flang/test/Lower/OpenMP/wsloop-collapse-continue.f90 b/flang/test/Lower/OpenMP/wsloop-collapse-continue.f90
new file mode 100644
index 0000000..fea7a8b
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-collapse-continue.f90
@@ -0,0 +1,19 @@
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+program wsloop_collapse_continue
+ integer i, j
+
+! CHECK: omp.wsloop {{.*}} {
+! CHECK: omp.loop_nest ({{.*}}) : i32 = ({{.*}}) to ({{.*}}) inclusive step ({{.*}}) collapse(2) {
+ !$omp do collapse(2)
+ do 50 i = 1, 42
+ do 51 j = 1, 84
+! CHECK: fir.call @_FortranAioOutputInteger32(
+ print *, i
+! CHECK: fir.call @_FortranAioOutputInteger32(
+ print *, j
+ 51 continue
+ 50 continue
+ !$omp end do
+
+end program wsloop_collapse_continue
diff --git a/flang/test/Semantics/OpenMP/declare-simd.f90 b/flang/test/Semantics/OpenMP/declare-simd.f90
index ceed2c2..bb259b8 100644
--- a/flang/test/Semantics/OpenMP/declare-simd.f90
+++ b/flang/test/Semantics/OpenMP/declare-simd.f90
@@ -19,4 +19,9 @@ end
subroutine f01
end
+integer function f02
+!Ok, expect no diagnostics
+!$omp declare_simd(f02)
+end
+
end module
diff --git a/flang/test/Semantics/OpenMP/do08.f90 b/flang/test/Semantics/OpenMP/do08.f90
index bb3c1d0c..5143dff 100644
--- a/flang/test/Semantics/OpenMP/do08.f90
+++ b/flang/test/Semantics/OpenMP/do08.f90
@@ -61,7 +61,6 @@ program omp
!$omp end do
- !ERROR: Canonical loop nest must be perfectly nested.
!ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
!$omp do collapse(3)
do 60 i=2,200,2
diff --git a/flang/test/Semantics/OpenMP/do13.f90 b/flang/test/Semantics/OpenMP/do13.f90
index 8f7844f..6e9d1dd 100644
--- a/flang/test/Semantics/OpenMP/do13.f90
+++ b/flang/test/Semantics/OpenMP/do13.f90
@@ -59,7 +59,6 @@ program omp
!$omp end do
- !ERROR: Canonical loop nest must be perfectly nested.
!ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
!$omp do collapse(3)
do 60 i=1,10
diff --git a/libc/src/__support/macros/attributes.h b/libc/src/__support/macros/attributes.h
index 145aa3b..d5ff028 100644
--- a/libc/src/__support/macros/attributes.h
+++ b/libc/src/__support/macros/attributes.h
@@ -81,4 +81,14 @@ LIBC_THREAD_MODE_EXTERNAL.
#define LIBC_HAS_VECTOR_TYPE 0
#endif
+#if __has_attribute(no_sanitize)
+// Disable regular and hardware-supported ASan for functions that may
+// intentionally make out-of-bounds access. Disable TSan as well, as it detects
+// out-of-bounds accesses to heap memory.
+#define LIBC_NO_SANITIZE_OOB_ACCESS \
+ __attribute__((no_sanitize("address", "hwaddress", "thread")))
+#else
+#define LIBC_NO_SANITIZE_OOB_ACCESS
+#endif
+
#endif // LLVM_LIBC_SRC___SUPPORT_MACROS_ATTRIBUTES_H
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index b8cdb2a7..83c9564 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -22,6 +22,7 @@ add_header_library(
libc.src.__support.CPP.type_traits
libc.src.__support.CPP.simd
libc.src.__support.common
+ libc.src.__support.macros.attributes
libc.src.string.memory_utils.inline_memcpy
${string_config_options}
)
diff --git a/libc/src/string/memory_utils/aarch64/inline_strlen.h b/libc/src/string/memory_utils/aarch64/inline_strlen.h
index 36fd1aa..87f5ccd 100644
--- a/libc/src/string/memory_utils/aarch64/inline_strlen.h
+++ b/libc/src/string/memory_utils/aarch64/inline_strlen.h
@@ -17,7 +17,7 @@
namespace LIBC_NAMESPACE_DECL {
namespace neon {
-[[gnu::no_sanitize_address]] [[maybe_unused]] LIBC_INLINE static size_t
+[[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t
string_length(const char *src) {
using Vector __attribute__((may_alias)) = uint8x8_t;
diff --git a/libc/src/string/memory_utils/generic/inline_strlen.h b/libc/src/string/memory_utils/generic/inline_strlen.h
index d7435af..69700e8 100644
--- a/libc/src/string/memory_utils/generic/inline_strlen.h
+++ b/libc/src/string/memory_utils/generic/inline_strlen.h
@@ -24,8 +24,7 @@ LIBC_INLINE constexpr cpp::simd_mask<char> shift_mask(cpp::simd_mask<char> m,
return cpp::bit_cast<cpp::simd_mask<char>>(r);
}
-[[clang::no_sanitize("address")]] LIBC_INLINE size_t
-string_length(const char *src) {
+LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE size_t string_length(const char *src) {
constexpr cpp::simd<char> null_byte = cpp::splat('\0');
size_t alignment = alignof(cpp::simd<char>);
diff --git a/libc/src/string/memory_utils/x86_64/inline_strlen.h b/libc/src/string/memory_utils/x86_64/inline_strlen.h
index 739f8c1..9e10d58 100644
--- a/libc/src/string/memory_utils/x86_64/inline_strlen.h
+++ b/libc/src/string/memory_utils/x86_64/inline_strlen.h
@@ -18,12 +18,12 @@ namespace LIBC_NAMESPACE_DECL {
namespace string_length_internal {
// Return a bit-mask with the nth bit set if the nth-byte in block_ptr is zero.
template <typename Vector, typename Mask>
-[[gnu::no_sanitize_address]] LIBC_INLINE static Mask
+LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static Mask
compare_and_mask(const Vector *block_ptr);
template <typename Vector, typename Mask,
decltype(compare_and_mask<Vector, Mask>)>
-[[gnu::no_sanitize_address]] LIBC_INLINE static size_t
+LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t
string_length_vector(const char *src) {
uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
index 9d636d0..7feef56 100644
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -19,6 +19,7 @@
#include "hdr/types/size_t.h"
#include "src/__support/CPP/bitset.h"
#include "src/__support/CPP/type_traits.h" // cpp::is_same_v
+#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
#include "src/string/memory_utils/inline_memcpy.h"
@@ -119,7 +120,7 @@ template <typename T> LIBC_INLINE size_t string_length(const T *src) {
}
template <typename Word>
-[[gnu::no_sanitize_address]] LIBC_INLINE void *
+LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE void *
find_first_character_wide_read(const unsigned char *src, unsigned char ch,
size_t n) {
const unsigned char *char_ptr = src;
diff --git a/libunwind/test/configs/cmake-bridge.cfg.in b/libunwind/test/configs/cmake-bridge.cfg.in
index b804c21..e40497b 100644
--- a/libunwind/test/configs/cmake-bridge.cfg.in
+++ b/libunwind/test/configs/cmake-bridge.cfg.in
@@ -14,6 +14,7 @@
import os, site
site.addsitedir(os.path.join('@LIBUNWIND_LIBCXX_PATH@', 'utils'))
import libcxx.test.format
+from lit.util import which
# Basic configuration of the test suite
config.name = os.path.basename('@LIBUNWIND_TEST_CONFIG@')
@@ -33,3 +34,13 @@ config.substitutions.append(('%{install-prefix}', '@LIBUNWIND_TESTING_INSTALL_PR
config.substitutions.append(('%{include}', '@LIBUNWIND_TESTING_INSTALL_PREFIX@/include'))
config.substitutions.append(('%{lib}', '@LIBUNWIND_TESTING_INSTALL_PREFIX@/@LIBUNWIND_INSTALL_LIBRARY_DIR@'))
config.substitutions.append(('%{benchmark_flags}', ''))
+
+# Check for objcopy tools
+objcopy_path = which('llvm-objcopy', '@LLVM_BUILD_BINARY_DIR@/bin')
+if not objcopy_path:
+ objcopy_path = which('llvm-objcopy')
+if not objcopy_path:
+ objcopy_path = which('objcopy')
+if objcopy_path:
+ config.substitutions.append(('%{objcopy}', objcopy_path))
+ config.available_features.add('objcopy-available')
diff --git a/libunwind/test/eh_frame_fde_pc_range.pass.cpp b/libunwind/test/eh_frame_fde_pc_range.pass.cpp
index 39c8e80..852612b 100644
--- a/libunwind/test/eh_frame_fde_pc_range.pass.cpp
+++ b/libunwind/test/eh_frame_fde_pc_range.pass.cpp
@@ -14,16 +14,15 @@
// clang-format off
// REQUIRES: target={{x86_64-.+-linux-gnu}}
-// aarch64,arm have a cross toolchain build(llvm-clang-win-x-aarch64, etc)
-// where objdump is not available.
+// REQUIRES: objcopy-available
// TODO: Figure out why this fails with Memory Sanitizer.
// XFAIL: msan
// RUN: %{build}
-// RUN: objcopy --dump-section .eh_frame_hdr=%t_ehf_hdr.bin %t.exe
+// RUN: %{objcopy} --dump-section .eh_frame_hdr=%t_ehf_hdr.bin %t.exe
// RUN: echo -ne '\xFF' | dd of=%t_ehf_hdr.bin bs=1 seek=2 count=2 conv=notrunc status=none
-// RUN: objcopy --update-section .eh_frame_hdr=%t_ehf_hdr.bin %t.exe
+// RUN: %{objcopy} --update-section .eh_frame_hdr=%t_ehf_hdr.bin %t.exe
// RUN: %{exec} %t.exe
// clang-format on
diff --git a/lldb/include/lldb/Target/Statistics.h b/lldb/include/lldb/Target/Statistics.h
index d6983bb..26538352 100644
--- a/lldb/include/lldb/Target/Statistics.h
+++ b/lldb/include/lldb/Target/Statistics.h
@@ -322,12 +322,14 @@ public:
void IncreaseSourceRealpathCompatibleCount(uint32_t count);
StatsDuration &GetCreateTime() { return m_create_time; }
+ StatsDuration &GetLoadCoreTime() { return m_load_core_time; }
StatsSuccessFail &GetExpressionStats() { return m_expr_eval; }
StatsSuccessFail &GetFrameVariableStats() { return m_frame_var; }
void Reset(Target &target);
protected:
StatsDuration m_create_time;
+ StatsDuration m_load_core_time;
std::optional<StatsTimepoint> m_launch_or_attach_time;
std::optional<StatsTimepoint> m_first_private_stop_time;
std::optional<StatsTimepoint> m_first_public_stop_time;
diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp
index eb56337..90ffe78 100644
--- a/lldb/source/API/SBTarget.cpp
+++ b/lldb/source/API/SBTarget.cpp
@@ -255,6 +255,7 @@ SBProcess SBTarget::LoadCore(const char *core_file, lldb::SBError &error) {
ProcessSP process_sp(target_sp->CreateProcess(
target_sp->GetDebugger().GetListener(), "", &filespec, false));
if (process_sp) {
+ ElapsedTime loadCoreTime(target_sp->GetStatistics().GetLoadCoreTime());
error.SetError(process_sp->LoadCore());
if (error.Success())
sb_process.SetSP(process_sp);
diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp
index 940be42..b5fc49d 100644
--- a/lldb/source/Commands/CommandObjectTarget.cpp
+++ b/lldb/source/Commands/CommandObjectTarget.cpp
@@ -418,7 +418,11 @@ protected:
if (process_sp) {
// Seems weird that we Launch a core file, but that is what we
// do!
- error = process_sp->LoadCore();
+ {
+ ElapsedTime loadCoreTime(
+ target_sp->GetStatistics().GetLoadCoreTime());
+ error = process_sp->LoadCore();
+ }
if (error.Fail()) {
result.AppendError(error.AsCString("unknown core file format"));
diff --git a/lldb/source/Target/Statistics.cpp b/lldb/source/Target/Statistics.cpp
index 8ad8d50..f7311a8b 100644
--- a/lldb/source/Target/Statistics.cpp
+++ b/lldb/source/Target/Statistics.cpp
@@ -148,6 +148,11 @@ TargetStats::ToJSON(Target &target,
target_metrics_json.try_emplace("targetCreateTime",
m_create_time.get().count());
+ if (m_load_core_time.get().count() > 0) {
+ target_metrics_json.try_emplace("loadCoreTime",
+ m_load_core_time.get().count());
+ }
+
json::Array breakpoints_array;
double totalBreakpointResolveTime = 0.0;
// Report both the normal breakpoint list and the internal breakpoint list.
diff --git a/lldb/test/API/functionalities/json/symbol-file/Makefile b/lldb/test/API/functionalities/json/symbol-file/Makefile
index 13bc164..5d05d95f 100644
--- a/lldb/test/API/functionalities/json/symbol-file/Makefile
+++ b/lldb/test/API/functionalities/json/symbol-file/Makefile
@@ -1,4 +1,5 @@
C_SOURCES := main.c
+CFLAGS_EXTRAS := -no-pie
all: stripped.out
diff --git a/lldb/test/API/functionalities/stats_api/TestStatisticsAPI.py b/lldb/test/API/functionalities/stats_api/TestStatisticsAPI.py
index f06c9ae..d7249df 100644
--- a/lldb/test/API/functionalities/stats_api/TestStatisticsAPI.py
+++ b/lldb/test/API/functionalities/stats_api/TestStatisticsAPI.py
@@ -1,6 +1,7 @@
# Test the SBAPI for GetStatistics()
import json
+
import lldb
from lldbsuite.test.decorators import *
from lldbsuite.test.lldbtest import *
@@ -54,6 +55,11 @@ class TestStatsAPI(TestBase):
stats_json,
'Make sure the "frameVariable" key in in target.GetStatistics()["targets"][0]',
)
+ self.assertNotIn(
+ "loadCoreTime",
+ stats_json,
+ "LoadCoreTime should not be present in a live, non-coredump target",
+ )
expressionEvaluation = stats_json["expressionEvaluation"]
self.assertIn(
"successes",
@@ -157,3 +163,25 @@ class TestStatsAPI(TestBase):
stats_force.GetAsJSON(stream_force)
debug_stats_force = json.loads(stream_force.GetData())
self.assertEqual(debug_stats_force["totalDebugInfoByteSize"], 445)
+
+ def test_core_load_time(self):
+ """
+ Test to see if the coredump path is included in statistics dump.
+ """
+ yaml_file = "arm64-minidump-build-ids.yaml"
+ src_dir = self.getSourceDir()
+ minidump_path = self.getBuildArtifact(os.path.basename(yaml_file) + ".dmp")
+ self.yaml2obj(os.path.join(src_dir, yaml_file), minidump_path)
+ target = self.dbg.CreateTarget(None)
+ process = target.LoadCore(minidump_path)
+ self.assertTrue(process.IsValid())
+
+ stats_options = lldb.SBStatisticsOptions()
+ stats = target.GetStatistics(stats_options)
+ stream = lldb.SBStream()
+ stats.GetAsJSON(stream)
+ debug_stats = json.loads(stream.GetData())
+ self.assertTrue("targets" in debug_stats)
+ target_info = debug_stats["targets"][0]
+ self.assertTrue("loadCoreTime" in target_info)
+ self.assertTrue(float(target_info["loadCoreTime"]) > 0.0)
diff --git a/lldb/test/API/functionalities/stats_api/arm64-minidump-build-ids.yaml b/lldb/test/API/functionalities/stats_api/arm64-minidump-build-ids.yaml
new file mode 100644
index 0000000..4acbc40
--- /dev/null
+++ b/lldb/test/API/functionalities/stats_api/arm64-minidump-build-ids.yaml
@@ -0,0 +1,19 @@
+--- !minidump
+Streams:
+ - Type: SystemInfo
+ Processor Arch: ARM
+ Platform ID: Linux
+ CSD Version: '15E216'
+ CPU:
+ CPUID: 0x00000000
+ - Type: ModuleList
+ Modules:
+ - Base of Image: 0x0000000000001000
+ Size of Image: 0x00001000
+ Module Name: '/tmp/a'
+ CodeView Record: 4C4570420102030405060708090A0B0C0D0E0F1011121314
+ - Base of Image: 0x0000000000001000
+ Size of Image: 0x00001000
+ Module Name: '/tmp/b'
+ CodeView Record: 4C4570420A141E28323C46505A646E78828C96A0AAB4BEC8
+...
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index b981929..c450ee5 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -1011,6 +1011,9 @@ set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ${LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_defa
set(LLVM_PROFDATA_FILE "" CACHE FILEPATH
"Profiling data file to use when compiling in order to improve runtime performance.")
+set(LLVM_SPROFDATA_FILE "" CACHE FILEPATH
+ "Sampling profiling data file to use when compiling in order to improve runtime performance.")
+
if(LLVM_INCLUDE_TESTS)
# All LLVM Python files should be compatible down to this minimum version.
set(LLVM_MINIMUM_PYTHON_VERSION 3.8)
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 8eca29f..d4195db6 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -1184,7 +1184,7 @@ if(LLVM_ENABLE_EH AND NOT LLVM_ENABLE_RTTI)
message(FATAL_ERROR "Exception handling requires RTTI. You must set LLVM_ENABLE_RTTI to ON")
endif()
-set(LLVM_BUILD_INSTRUMENTED OFF CACHE STRING "Build LLVM and tools with PGO instrumentation. May be specified as IR or Frontend")
+set(LLVM_BUILD_INSTRUMENTED OFF CACHE STRING "Build LLVM and tools with PGO instrumentation. May be specified as IR, Frontend, CSIR, CSSPGO")
set(LLVM_VP_COUNTERS_PER_SITE "1.5" CACHE STRING "Value profile counters to use per site for IR PGO with Clang")
mark_as_advanced(LLVM_BUILD_INSTRUMENTED LLVM_VP_COUNTERS_PER_SITE)
string(TOUPPER "${LLVM_BUILD_INSTRUMENTED}" uppercase_LLVM_BUILD_INSTRUMENTED)
@@ -1217,6 +1217,19 @@ if (LLVM_BUILD_INSTRUMENTED)
CMAKE_EXE_LINKER_FLAGS
CMAKE_SHARED_LINKER_FLAGS)
endif()
+ elseif(uppercase_LLVM_BUILD_INSTRUMENTED STREQUAL "CSSPGO")
+ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ append("-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -fno-optimize-sibling-calls -fpseudo-probe-for-profiling -fdebug-info-for-profiling"
+ CMAKE_CXX_FLAGS
+ CMAKE_C_FLAGS)
+ if(NOT LINKER_IS_LLD_LINK)
+ append("-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -fno-optimize-sibling-calls -fpseudo-probe-for-profiling -fdebug-info-for-profiling"
+ CMAKE_EXE_LINKER_FLAGS
+ CMAKE_SHARED_LINKER_FLAGS)
+ endif()
+ else()
+ message(FATAL_ERROR "LLVM_BUILD_INSTRUMENTED=CSSPGO can only be specified when compiling with clang")
+ endif()
else()
append("-fprofile-instr-generate=\"${LLVM_PROFILE_FILE_PATTERN}\""
CMAKE_CXX_FLAGS
@@ -1269,6 +1282,21 @@ elseif(LLVM_PROFDATA_FILE)
message(WARNING "LLVM_PROFDATA_FILE specified, but ${LLVM_PROFDATA_FILE} not found")
endif()
+if(LLVM_SPROFDATA_FILE AND EXISTS ${LLVM_SPROFDATA_FILE})
+ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" )
+ append("-fpseudo-probe-for-profiling -fprofile-sample-use=\"${LLVM_SPROFDATA_FILE}\""
+ CMAKE_CXX_FLAGS
+ CMAKE_C_FLAGS)
+ if(NOT LINKER_IS_LLD_LINK)
+ append("-fpseudo-probe-for-profiling -fprofile-sample-use=\"${LLVM_SPROFDATA_FILE}\""
+ CMAKE_EXE_LINKER_FLAGS
+ CMAKE_SHARED_LINKER_FLAGS)
+ endif()
+ else()
+ message(FATAL_ERROR "LLVM_SPROFDATA_FILE can only be specified when compiling with clang")
+ endif()
+endif()
+
option(LLVM_BUILD_INSTRUMENTED_COVERAGE "Build LLVM and tools with Code Coverage instrumentation" Off)
option(LLVM_INDIVIDUAL_TEST_COVERAGE "Emit individual coverage file for each test case." OFF)
mark_as_advanced(LLVM_BUILD_INSTRUMENTED_COVERAGE)
diff --git a/llvm/docs/AMDGPU/AMDGPUAsmGFX12.rst b/llvm/docs/AMDGPU/AMDGPUAsmGFX12.rst
new file mode 100644
index 0000000..7259ee87
--- /dev/null
+++ b/llvm/docs/AMDGPU/AMDGPUAsmGFX12.rst
@@ -0,0 +1,2002 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+====================================================================================
+Syntax of GFX12 Instructions
+====================================================================================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+This document describes the syntax of GFX12 instructions.
+
+Notation
+========
+
+Notation used in this document is explained :ref:`here<amdgpu_syn_instruction_notation>`.
+
+Overview
+========
+
+An overview of generic syntax and other features of AMDGPU instructions may be found :ref:`in this document<amdgpu_syn_instructions>`.
+
+Instructions
+============
+
+
+SMEM
+----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **SRC3** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ s_atc_probe :ref:`sdata<amdgpu_synid_gfx12_sdata_d725ab>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_47adb7>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_atc_probe_buffer :ref:`sdata<amdgpu_synid_gfx12_sdata_d725ab>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_453b95>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_8ec073>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_buffer_load_b128 :ref:`sdata<amdgpu_synid_gfx12_sdata_4585b8>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_453b95>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_8ec073>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_buffer_load_b256 :ref:`sdata<amdgpu_synid_gfx12_sdata_0974a4>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_453b95>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_8ec073>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_buffer_load_b32 :ref:`sdata<amdgpu_synid_gfx12_sdata_836716>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_453b95>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_8ec073>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_buffer_load_b512 :ref:`sdata<amdgpu_synid_gfx12_sdata_6c003b>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_453b95>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_8ec073>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_buffer_load_b64 :ref:`sdata<amdgpu_synid_gfx12_sdata_354189>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_453b95>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_8ec073>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_buffer_load_b96 :ref:`sdata<amdgpu_synid_gfx12_sdata_dd9dd8>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_453b95>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_8ec073>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_buffer_load_i16 :ref:`sdata<amdgpu_synid_gfx12_sdata_836716>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_453b95>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_8ec073>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_buffer_load_i8 :ref:`sdata<amdgpu_synid_gfx12_sdata_836716>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_453b95>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_8ec073>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_buffer_load_u16 :ref:`sdata<amdgpu_synid_gfx12_sdata_836716>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_453b95>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_8ec073>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_buffer_load_u8 :ref:`sdata<amdgpu_synid_gfx12_sdata_836716>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_453b95>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_8ec073>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_buffer_nop :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_buffer_prefetch_data :ref:`sbase<amdgpu_synid_gfx12_sbase_453b95>`, :ref:`ioffset<amdgpu_synid_gfx12_ioffset>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_8ec073>`, :ref:`sdata<amdgpu_synid_gfx12_sdata_5c7b50>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_dcache_inv :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_load_b128 :ref:`sdata<amdgpu_synid_gfx12_sdata_4585b8>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_47adb7>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_load_b256 :ref:`sdata<amdgpu_synid_gfx12_sdata_0974a4>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_47adb7>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_load_b32 :ref:`sdata<amdgpu_synid_gfx12_sdata_836716>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_47adb7>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_load_b512 :ref:`sdata<amdgpu_synid_gfx12_sdata_6c003b>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_47adb7>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_load_b64 :ref:`sdata<amdgpu_synid_gfx12_sdata_354189>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_47adb7>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_load_b96 :ref:`sdata<amdgpu_synid_gfx12_sdata_dd9dd8>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_47adb7>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_load_i16 :ref:`sdata<amdgpu_synid_gfx12_sdata_836716>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_47adb7>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_load_i8 :ref:`sdata<amdgpu_synid_gfx12_sdata_836716>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_47adb7>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_load_u16 :ref:`sdata<amdgpu_synid_gfx12_sdata_836716>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_47adb7>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_load_u8 :ref:`sdata<amdgpu_synid_gfx12_sdata_836716>`, :ref:`sbase<amdgpu_synid_gfx12_sbase_47adb7>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_prefetch_data :ref:`sbase<amdgpu_synid_gfx12_sbase_47adb7>`, :ref:`ioffset<amdgpu_synid_gfx12_ioffset>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>`, :ref:`sdata<amdgpu_synid_gfx12_sdata_5c7b50>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_prefetch_data_pc_rel :ref:`ioffset<amdgpu_synid_gfx12_ioffset>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>`, :ref:`sdata<amdgpu_synid_gfx12_sdata_5c7b50>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_prefetch_inst :ref:`sbase<amdgpu_synid_gfx12_sbase_47adb7>`, :ref:`ioffset<amdgpu_synid_gfx12_ioffset>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>`, :ref:`sdata<amdgpu_synid_gfx12_sdata_5c7b50>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ s_prefetch_inst_pc_rel :ref:`ioffset<amdgpu_synid_gfx12_ioffset>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_ec005a>`, :ref:`sdata<amdgpu_synid_gfx12_sdata_5c7b50>` :ref:`offset24s<amdgpu_synid_smem_offset24s>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+
+SOP1
+----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ s_abs_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_alloc_vgpr :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_and_not0_saveexec_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_and_not0_saveexec_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_and_not0_wrexec_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_and_not0_wrexec_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_and_not1_saveexec_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_and_not1_saveexec_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_and_not1_wrexec_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_and_not1_wrexec_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_and_saveexec_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_and_saveexec_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_barrier_init :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_1a9ca5>`
+ s_barrier_join :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_1a9ca5>`
+ s_barrier_signal :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_1a9ca5>`
+ s_barrier_signal_isfirst :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_1a9ca5>`
+ s_bcnt0_i32_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_bcnt0_i32_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_bcnt1_i32_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_bcnt1_i32_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_bitreplicate_b64_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_bitset0_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_bitset0_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_bitset1_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_bitset1_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_brev_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_brev_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_ceil_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_ceil_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_cls_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_cls_i32_i64 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_clz_i32_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_clz_i32_u64 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_cmov_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_cmov_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_ctz_i32_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_ctz_i32_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_cvt_f16_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_cvt_f32_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_cvt_f32_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_cvt_f32_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_cvt_hi_f32_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_cvt_i32_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_cvt_u32_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_floor_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_floor_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_get_barrier_state :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_1a9ca5>`
+ s_get_lock_state :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_1a9ca5>`
+ s_getpc_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`
+ s_mov_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_mov_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_mov_fed_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_mov_from_global_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_007f9c>`
+ s_mov_from_global_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_2797bc>`
+ s_mov_regrd_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_mov_to_global_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_mov_to_global_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_movreld_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_movreld_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_movrels_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_007f9c>`
+ s_movrels_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_2797bc>`
+ s_movrelsd_2_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_007f9c>`
+ s_nand_saveexec_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_nand_saveexec_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_nor_saveexec_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_nor_saveexec_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_not_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_not_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_or_not0_saveexec_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_or_not0_saveexec_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_or_not1_saveexec_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_or_not1_saveexec_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_or_saveexec_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_or_saveexec_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_quadmask_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_quadmask_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_rfe_b64 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_2797bc>`
+ s_rndne_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_rndne_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_sendmsg_rtn_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_245536>`
+ s_sendmsg_rtn_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_245536>`
+ s_setpc_b64 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_2797bc>`
+ s_sext_i32_i16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_sext_i32_i8 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_sleep_var :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_swap_to_global_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_007f9c>`
+ s_swappc_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_2797bc>`
+ s_trunc_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_trunc_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_try_lock :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_1a9ca5>`
+ s_unlock :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_1a9ca5>`
+ s_wakeup_barrier :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_1a9ca5>`
+ s_wqm_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_wqm_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_xnor_saveexec_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_xnor_saveexec_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+ s_xor_saveexec_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_836716>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`
+ s_xor_saveexec_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`
+
+SOP2
+----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ s_absdiff_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_add_co_ci_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_add_co_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_add_co_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_add_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_add_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_add_nc_u64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+ s_and_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_and_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+ s_and_not1_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_and_not1_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+ s_ashr_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_ashr_i64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_bfe_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_bfe_i64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_bfe_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_bfe_u64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_bfm_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_bfm_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cselect_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cselect_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+ s_cvt_pk_rtz_f16_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_fmaak_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ s_fmac_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_fmac_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_fmamk_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_lshl1_add_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_lshl2_add_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_lshl3_add_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_lshl4_add_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_lshl_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_lshl_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_lshr_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_lshr_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_max_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_max_num_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_max_num_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_max_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_maximum_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_maximum_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_min_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_min_num_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_min_num_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_min_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_minimum_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_minimum_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_mul_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_mul_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_mul_hi_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_mul_hi_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_mul_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_mul_u64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+ s_nand_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_nand_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+ s_nor_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_nor_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+ s_or_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_or_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+ s_or_not1_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_or_not1_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+ s_pack_hh_b32_b16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_pack_hl_b32_b16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_pack_lh_b32_b16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_pack_ll_b32_b16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_sub_co_ci_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_sub_co_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_sub_co_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_sub_f16 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_sub_f32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_sub_nc_u64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+ s_xnor_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_xnor_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+ s_xor_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_xor_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+
+SOPC
+----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **SRC0** **SRC1**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ s_bitcmp0_b32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_bitcmp0_b64 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_bitcmp1_b32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_bitcmp1_b64 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_eq_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_eq_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_eq_i32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_eq_u32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_eq_u64 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+ s_cmp_ge_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_ge_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_ge_i32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_ge_u32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_gt_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_gt_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_gt_i32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_gt_u32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_le_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_le_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_le_i32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_le_u32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_lg_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_lg_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_lg_i32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_lg_u32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_lg_u64 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_bbb4c6>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_bbb4c6>`
+ s_cmp_lt_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_lt_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_lt_i32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_lt_u32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_neq_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_neq_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_nge_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_nge_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_ngt_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_ngt_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_nle_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_nle_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_nlg_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_nlg_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_nlt_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_nlt_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_o_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_o_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_u_f16 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+ s_cmp_u_f32 :ref:`ssrc0<amdgpu_synid_gfx12_ssrc0_c4593f>`, :ref:`ssrc1<amdgpu_synid_gfx12_ssrc1_c4593f>`
+
+SOPK
+----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ s_addk_co_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_call_b64 :ref:`sdst<amdgpu_synid_gfx12_sdst_354189>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_cmovk_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_cmpk_eq_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_cmpk_eq_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_cmpk_ge_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_cmpk_ge_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_cmpk_gt_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_cmpk_gt_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_cmpk_le_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_cmpk_le_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_cmpk_lg_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_cmpk_lg_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_cmpk_lt_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_cmpk_lt_u32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_getreg_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_7ed651>`
+ s_getreg_regrd_b32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_7ed651>`
+ s_movk_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_mulk_i32 :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_setreg_b32 :ref:`simm16<amdgpu_synid_gfx12_simm16_cc1716>`, :ref:`sdst<amdgpu_synid_gfx12_sdst_20064d>`
+ s_setreg_imm32_b32 :ref:`simm16<amdgpu_synid_gfx12_simm16_cc1716>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ s_subvector_loop_begin :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_subvector_loop_end :ref:`sdst<amdgpu_synid_gfx12_sdst_ced58d>`, :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_version :ref:`simm16<amdgpu_synid_gfx12_simm16_15ccdd>`
+
+SOPP
+----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **SRC**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ s_barrier
+ s_barrier_leave
+ s_barrier_wait :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_branch :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_cbranch_cdbgsys :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_cbranch_cdbgsys_and_user :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_cbranch_cdbgsys_or_user :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_cbranch_cdbguser :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_cbranch_execnz :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_cbranch_execz :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_cbranch_scc0 :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_cbranch_scc1 :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_cbranch_vccnz :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_cbranch_vccz :ref:`simm16<amdgpu_synid_gfx12_simm16_3d2a4f>`
+ s_clause :ref:`simm16<amdgpu_synid_gfx12_simm16_730a13>`
+ s_code_end
+ s_decperflevel :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_delay_alu :ref:`simm16<amdgpu_synid_gfx12_simm16_c98889>`
+ s_denorm_mode :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_endpgm
+ s_endpgm_ordered_ps_done
+ s_endpgm_saved
+ s_icache_inv
+ s_incperflevel :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_nop :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_round_mode :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_sendmsg :ref:`simm16<amdgpu_synid_gfx12_simm16_ee8b30>`
+ s_sendmsghalt :ref:`simm16<amdgpu_synid_gfx12_simm16_ee8b30>`
+ s_set_inst_prefetch_distance :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_sethalt :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_setkill :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_setprio :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_singleuse_vdst :ref:`simm16<amdgpu_synid_gfx12_simm16_81e671>`
+ s_sleep :ref:`simm16<amdgpu_synid_gfx12_simm16_81e671>`
+ s_trap :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_ttracedata
+ s_ttracedata_imm :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_wait_alu :ref:`simm16<amdgpu_synid_gfx12_simm16_81e671>`
+ s_wait_bvhcnt :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_wait_dscnt :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_wait_event :ref:`simm16<amdgpu_synid_gfx12_simm16_81e671>`
+ s_wait_expcnt :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_wait_idle
+ s_wait_kmcnt :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_wait_loadcnt :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_wait_loadcnt_dscnt :ref:`simm16<amdgpu_synid_gfx12_simm16_81e671>`
+ s_wait_samplecnt :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_wait_storecnt :ref:`simm16<amdgpu_synid_gfx12_simm16_39b593>`
+ s_wait_storecnt_dscnt :ref:`simm16<amdgpu_synid_gfx12_simm16_81e671>`
+ s_waitcnt :ref:`simm16<amdgpu_synid_gfx12_simm16_218bea>`
+ s_wakeup
+
+VBUFFER
+-------
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ buffer_atomic_add_f32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_add_u32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_add_u64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_and_b32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_and_b64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_cmpswap_b32 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_cmpswap_b64 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_cond_sub_u32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_dec_u32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_dec_u64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_inc_u32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_inc_u64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_max_i32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_max_i64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_max_num_f32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_max_u32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_max_u64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_min_i32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_min_i64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_min_num_f32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_min_u32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_min_u64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_or_b32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_or_b64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_pk_add_bf16 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_pk_add_f16 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_sub_clamp_u32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_sub_u32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_sub_u64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_swap_b32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_swap_b64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_xor_b32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_atomic_xor_b64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_gl0_inv :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_gl1_inv :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_b128 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_b32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_b64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_b96 :ref:`vdata<amdgpu_synid_gfx12_vdata_48e42f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_block :ref:`vdata<amdgpu_synid_gfx12_vdata_2eda77>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_d16_b16 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_d16_format_x :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_d16_format_xy :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_d16_format_xyz :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_d16_format_xyzw :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_d16_hi_b16 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_d16_hi_format_x :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_d16_hi_i8 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_d16_hi_u8 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_d16_i8 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_d16_u8 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_format_x :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_format_xy :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_format_xyz :ref:`vdata<amdgpu_synid_gfx12_vdata_48e42f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_format_xyzw :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_i16 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_i8 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_lds_b32 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_lds_format_x :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_lds_i16 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_lds_i8 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_lds_u16 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_lds_u8 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_u16 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_load_u8 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_nop :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_b128 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_b16 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_b32 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_b64 :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_b8 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_b96 :ref:`vdata<amdgpu_synid_gfx12_vdata_48e42f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_block :ref:`vdata<amdgpu_synid_gfx12_vdata_2eda77>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_d16_format_x :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_d16_format_xy :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_d16_format_xyz :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_d16_format_xyzw :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_d16_hi_b16 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_d16_hi_b8 :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_d16_hi_format_x :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_format_x :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_format_xy :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_format_xyz :ref:`vdata<amdgpu_synid_gfx12_vdata_48e42f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ buffer_store_format_xyzw :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_load_d16_format_x :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_load_d16_format_xy :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_load_d16_format_xyz :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_load_d16_format_xyzw :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_load_format_x :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_load_format_xy :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_load_format_xyz :ref:`vdata<amdgpu_synid_gfx12_vdata_48e42f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_load_format_xyzw :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_store_d16_format_x :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_store_d16_format_xy :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_store_d16_format_xyz :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_store_d16_format_xyzw :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_store_format_x :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_store_format_xy :ref:`vdata<amdgpu_synid_gfx12_vdata_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_store_format_xyz :ref:`vdata<amdgpu_synid_gfx12_vdata_48e42f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ tbuffer_store_format_xyzw :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>`, :ref:`soffset<amdgpu_synid_gfx12_soffset_c5b88c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`idxen<amdgpu_synid_idxen>` :ref:`offen<amdgpu_synid_offen>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+
+VDS
+---
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ ds_add_f32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_add_f64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_add_rtn_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_add_rtn_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_add_rtn_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_add_u32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_add_u64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_and_b32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_and_b64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_and_rtn_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_and_rtn_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_append :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_bpermute_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_bpermute_fi_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_bpermute_fi_from_global_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_bpermute_fi_to_global_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_bpermute_fi_to_simd_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_bpermute_from_global_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_bpermute_to_global_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_bpermute_to_simd_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_bvh_stack_push4_pop1_rtn_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>`, :ref:`data1<amdgpu_synid_gfx12_data1_e016a1>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_bvh_stack_push8_pop1_rtn_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>`, :ref:`data1<amdgpu_synid_gfx12_data1_731030>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_bvh_stack_push8_pop2_rtn_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>`, :ref:`data1<amdgpu_synid_gfx12_data1_731030>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_cmpstore_b32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>`, :ref:`data1<amdgpu_synid_gfx12_data1_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_cmpstore_b64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>`, :ref:`data1<amdgpu_synid_gfx12_data1_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_cmpstore_rtn_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>`, :ref:`data1<amdgpu_synid_gfx12_data1_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_cmpstore_rtn_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>`, :ref:`data1<amdgpu_synid_gfx12_data1_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_cond_sub_rtn_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_cond_sub_u32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_condxchg32_rtn_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_consume :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_dec_rtn_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_dec_rtn_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_dec_u32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_dec_u64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_inc_rtn_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_inc_rtn_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_inc_u32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_inc_u64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_2addr_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_2addr_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_69a144>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_2addr_stride64_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_2addr_stride64_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_69a144>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_addtid_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_b128 :ref:`vdst<amdgpu_synid_gfx12_vdst_69a144>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_b96 :ref:`vdst<amdgpu_synid_gfx12_vdst_48e42f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_i8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_i8_d16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_i8_d16_hi :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_u16_d16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_u16_d16_hi :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_u8_d16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_load_u8_d16_hi :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_max_i32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_max_i64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_max_num_f32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_max_num_f64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_max_num_rtn_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_max_num_rtn_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_max_rtn_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_max_rtn_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_max_rtn_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_max_rtn_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_max_u32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_max_u64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_min_i32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_min_i64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_min_num_f32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_min_num_f64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_min_num_rtn_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_min_num_rtn_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_min_rtn_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_min_rtn_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_min_rtn_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_min_rtn_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_min_u32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_min_u64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_mskor_b32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>`, :ref:`data1<amdgpu_synid_gfx12_data1_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_mskor_b64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>`, :ref:`data1<amdgpu_synid_gfx12_data1_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_mskor_rtn_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>`, :ref:`data1<amdgpu_synid_gfx12_data1_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_mskor_rtn_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>`, :ref:`data1<amdgpu_synid_gfx12_data1_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_nop :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_or_b32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_or_b64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_or_rtn_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_or_rtn_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_permute_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_permute_from_global_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_permute_to_global_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_permute_to_simd_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_pk_add_bf16 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_pk_add_f16 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_pk_add_rtn_bf16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_pk_add_rtn_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_rsub_rtn_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_rsub_rtn_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_rsub_u32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_rsub_u64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_store_2addr_b32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>`, :ref:`data1<amdgpu_synid_gfx12_data1_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_store_2addr_b64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>`, :ref:`data1<amdgpu_synid_gfx12_data1_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_store_2addr_stride64_b32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>`, :ref:`data1<amdgpu_synid_gfx12_data1_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_store_2addr_stride64_b64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>`, :ref:`data1<amdgpu_synid_gfx12_data1_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_store_addtid_b32 :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_store_b128 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_e016a1>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_store_b16 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_store_b16_d16_hi :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_store_b32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_store_b64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_store_b8 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_store_b8_d16_hi :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_store_b96 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_56f215>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_storexchg_2addr_rtn_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>`, :ref:`data1<amdgpu_synid_gfx12_data1_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_storexchg_2addr_rtn_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_69a144>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>`, :ref:`data1<amdgpu_synid_gfx12_data1_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_storexchg_2addr_stride64_rtn_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>`, :ref:`data1<amdgpu_synid_gfx12_data1_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_storexchg_2addr_stride64_rtn_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_69a144>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>`, :ref:`data1<amdgpu_synid_gfx12_data1_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_storexchg_rtn_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_storexchg_rtn_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_sub_clamp_rtn_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_sub_clamp_u32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_sub_rtn_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_sub_rtn_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_sub_u32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_sub_u64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_swizzle_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_wrap_rtn_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>`, :ref:`data1<amdgpu_synid_gfx12_data1_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_xor_b32 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_xor_b64 :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_xor_rtn_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+ ds_xor_rtn_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`addr<amdgpu_synid_gfx12_addr>`, :ref:`data0<amdgpu_synid_gfx12_data0_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`offset0<amdgpu_synid_ds_offset80>` :ref:`offset1<amdgpu_synid_ds_offset81>`
+
+VDSDIR
+------
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ ds_direct_load :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>` :ref:`wait_va_vdst<amdgpu_synid_wait_va_vdst>` :ref:`wait_vdst<amdgpu_synid_wait_vdst>` :ref:`wait_vm_vsrc<amdgpu_synid_wait_vm_vsrc>`
+ ds_param_load :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`attr<amdgpu_synid_gfx12_attr>` :ref:`wait_va_vdst<amdgpu_synid_wait_va_vdst>` :ref:`wait_vdst<amdgpu_synid_wait_vdst>` :ref:`wait_vm_vsrc<amdgpu_synid_wait_vm_vsrc>`
+
+VERIF
+-----
+
+.. parsed-literal::
+
+ **INSTRUCTION**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ fake_s_delay_alu
+ fake_s_nop
+ fake_s_wait_alu
+ fake_s_wait_bvhcnt
+ fake_s_wait_dscnt
+ fake_s_wait_expcnt
+ fake_s_wait_kmcnt
+ fake_s_wait_loadcnt
+ fake_s_wait_samplecnt
+ fake_s_wait_storecnt
+ fake_s_waitcnt
+ fake_v_nop
+ ill_0
+ ill_1
+ ill_beef
+ metadata
+ verif_s_adjdelay_alu
+
+VEXPORT
+-------
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **SRC3** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ export :ref:`tgt<amdgpu_synid_gfx12_tgt>`, :ref:`vsrc0<amdgpu_synid_gfx12_vsrc0>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`, :ref:`vsrc2<amdgpu_synid_gfx12_vsrc2>`, :ref:`vsrc3<amdgpu_synid_gfx12_vsrc3>` :ref:`done<amdgpu_synid_done>` :ref:`row_en<amdgpu_synid_row_en>`
+
+VFLAT
+-----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ flat_atomic_add_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_add_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_add_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_and_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_and_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_cmpswap_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_cmpswap_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_e016a1>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_cond_sub_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_dec_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_dec_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_inc_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_inc_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_max_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_max_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_max_num_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_max_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_max_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_min_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_min_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_min_num_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_min_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_min_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_or_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_or_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_pk_add_bf16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_pk_add_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_sub_clamp_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_sub_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_sub_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_swap_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_swap_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_xor_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_atomic_xor_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_b128 :ref:`vdst<amdgpu_synid_gfx12_vdst_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_b96 :ref:`vdst<amdgpu_synid_gfx12_vdst_48e42f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_d16_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_d16_hi_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_d16_hi_i8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_d16_hi_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_d16_i8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_d16_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_i8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_load_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_store_b128 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_e016a1>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_store_b16 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_store_b32 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_store_b64 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_store_b8 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_store_b96 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_56f215>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_store_d16_hi_b16 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ flat_store_d16_hi_b8 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+
+VGLOBAL
+-------
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ global_atomic_add_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_add_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_add_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_and_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_and_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_cmpswap_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_cmpswap_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_e016a1>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_cond_sub_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_dec_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_dec_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_inc_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_inc_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_max_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_max_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_max_num_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_max_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_max_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_min_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_min_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_min_num_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_min_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_min_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_or_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_or_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_ordered_add_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_pk_add_bf16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_pk_add_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_sub_clamp_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_sub_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_sub_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_swap_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_swap_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_xor_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_atomic_xor_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_inv :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_addtid_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_b128 :ref:`vdst<amdgpu_synid_gfx12_vdst_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_b96 :ref:`vdst<amdgpu_synid_gfx12_vdst_48e42f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_block :ref:`vdst<amdgpu_synid_gfx12_vdst_2eda77>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_d16_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_d16_hi_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_d16_hi_i8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_d16_hi_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_d16_i8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_d16_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_i8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_lds_addtid_b32 :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_lds_b32 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_lds_i16 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_lds_i8 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_lds_u16 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_lds_u8 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_tr_b128 :ref:`vdst<amdgpu_synid_gfx12_vdst_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_tr_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_load_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_store_addtid_b32 :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_store_b128 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_e016a1>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_store_b16 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_store_b32 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_store_b64 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_store_b8 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_store_b96 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_56f215>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_store_block :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_89fd7b>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_store_d16_hi_b16 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_store_d16_hi_b8 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_f2b449>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_cdc95c>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_wb :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ global_wbinv :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+
+VIMAGE
+------
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ image_atomic_add_flt :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_add_uint :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_and :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_cmpswap :ref:`vdata<amdgpu_synid_gfx12_vdata_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_dec_uint :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_inc_uint :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_max_flt :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_max_int :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_max_uint :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_min_flt :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_min_int :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_min_uint :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_or :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_pk_add_bf16 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_pk_add_f16 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_sub_uint :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_swap :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_atomic_xor :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_bvh64_intersect_ray :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c12f43>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_bvh8_intersect_ray :ref:`vdata<amdgpu_synid_gfx12_vdata_aac3e8>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_a972b9>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_bvh_dual_intersect_ray :ref:`vdata<amdgpu_synid_gfx12_vdata_aac3e8>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c12f43>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_bvh_intersect_ray :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_a972b9>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_5fe6d8>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_get_resinfo :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_load :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_load_mip :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_load_mip_pck :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_load_mip_pck_sgn :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_load_pck :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_load_pck_sgn :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_rsvd_atomic_umax_8 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_rsvd_atomic_umin_8 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_store :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_store_mip :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_store_mip_pck :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_store_pck :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+
+VINTERP
+-------
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ v_interp_p10_f16_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`, :ref:`src1<amdgpu_synid_gfx12_src1_6802ce>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`wait_exp<amdgpu_synid_wait_exp>`
+ v_interp_p10_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`, :ref:`src1<amdgpu_synid_gfx12_src1_6802ce>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`wait_exp<amdgpu_synid_wait_exp>`
+ v_interp_p10_rtz_f16_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`, :ref:`src1<amdgpu_synid_gfx12_src1_6802ce>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`wait_exp<amdgpu_synid_wait_exp>`
+ v_interp_p2_f16_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`, :ref:`src1<amdgpu_synid_gfx12_src1_6802ce>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`wait_exp<amdgpu_synid_wait_exp>`
+ v_interp_p2_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`, :ref:`src1<amdgpu_synid_gfx12_src1_6802ce>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`wait_exp<amdgpu_synid_wait_exp>`
+ v_interp_p2_rtz_f16_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`, :ref:`src1<amdgpu_synid_gfx12_src1_6802ce>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>` :ref:`clamp<amdgpu_synid_clamp>` :ref:`wait_exp<amdgpu_synid_wait_exp>`
+
+VOP1
+----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ v_bfrev_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_ceil_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_ceil_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_ceil_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cls_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_clz_i32_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cos_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cos_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_ctz_i32_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f16_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f16_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f16_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f32_bf8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f32_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f32_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f32_fp8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f32_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f32_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f32_ubyte0 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f32_ubyte1 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f32_ubyte2 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f32_ubyte3 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f64_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f64_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_f64_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_floor_i32_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_i16_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_i32_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_i32_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_i32_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_nearest_i32_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_norm_i16_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_norm_u16_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_off_f32_i4 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_f32_bf8 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_f32_fp8 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_u16_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_u32_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_u32_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_u32_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_exp_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_exp_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_floor_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_floor_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_floor_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fract_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fract_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fract_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_frexp_exp_i16_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_frexp_exp_i32_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_frexp_exp_i32_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_frexp_mant_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_frexp_mant_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_frexp_mant_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_log_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_log_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mov_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mov_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mov_fed_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mov_from_global_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mov_to_global_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_movreld_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_movrels_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_movrelsd_2_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_movrelsd_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_nop :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_not_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_not_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_permlane64_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_pipeflush :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_rcp_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_rcp_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_rcp_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_rcp_iflag_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_readfirstlane_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_836716>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_rndne_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_rndne_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_rndne_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_rsq_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_rsq_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_rsq_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sat_pk_u8_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sin_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sin_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sqrt_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sqrt_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sqrt_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_swap_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_swap_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_swaprel_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_trunc_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_trunc_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_trunc_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_writelane_regwr_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+
+VOP2
+----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST0** **DST1** **SRC0** **SRC1** **SRC2** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ v_add_co_ci_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`sdst<amdgpu_synid_gfx12_sdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_add_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_add_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_add_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_add_nc_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_add_nc_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_and_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_ashrrev_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cndmask_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_rtz_f16_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fmaak_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fmaak_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fmaak_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`literal<amdgpu_synid_gfx12_literal_1f74c7>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fmac_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fmac_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fmac_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fmamk_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fmamk_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fmamk_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`literal<amdgpu_synid_gfx12_literal_1f74c7>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_illegal :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_ldexp_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_lshlrev_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_lshlrev_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_lshrrev_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_max_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_max_num_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_max_num_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_max_num_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_max_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_min_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_min_num_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_min_num_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_min_num_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_min_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mul_dx9_zero_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mul_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mul_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mul_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mul_hi_i32_i24 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mul_hi_u32_u24 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mul_i32_i24 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mul_u32_u24 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mul_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_or_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_pk_fmac_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sub_co_ci_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`sdst<amdgpu_synid_gfx12_sdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sub_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sub_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sub_nc_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sub_nc_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_subrev_co_ci_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`sdst<amdgpu_synid_gfx12_sdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_subrev_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_subrev_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_subrev_nc_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_xnor_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_xor_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+
+VOP3
+----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST0** **DST1** **SRC0** **SRC1** **SRC2** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ v_add3_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_add_co_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`sdst<amdgpu_synid_gfx12_sdst_e701cc>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_add_lshl_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_add_nc_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_add_nc_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_add_nc_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_alignbit_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_alignbyte_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_and_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_and_or_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_ashrrev_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_ashrrev_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_bcnt_u32_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_bfe_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_bfe_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_bfi_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_bfm_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cndmask_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_2797bc>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cubeid_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cubema_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cubesc_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cubetc_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_bf8_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_fp8_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_i16_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_i16_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_norm_i16_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_norm_i16_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_norm_u16_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_norm_u16_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_u16_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_u16_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_pk_u8_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_sr_bf8_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cvt_sr_fp8_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_div_fixup_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_div_fixup_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_div_fixup_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_div_fmas_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_div_fmas_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_div_scale_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`sdst<amdgpu_synid_gfx12_sdst_e701cc>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_div_scale_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`sdst<amdgpu_synid_gfx12_sdst_e701cc>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_dot2_bf16_bf16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_dot2_f16_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fma_dx9_zero_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fma_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fma_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_fma_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_ldexp_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_ldexp_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_lerp_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_lshl_add_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_lshl_add_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_lshl_or_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_lshlrev_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_lshrrev_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_lshrrev_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mad_co_i64_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`sdst<amdgpu_synid_gfx12_sdst_e701cc>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mad_co_u64_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`sdst<amdgpu_synid_gfx12_sdst_e701cc>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mad_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mad_i32_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mad_i32_i24 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mad_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mad_u32_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mad_u32_u24 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_max3_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_max3_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_max3_num_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_max3_num_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_max3_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_max3_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_max_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_max_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_maximum3_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_maximum3_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_maximum_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_maximum_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_maximum_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_maximumminimum_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_maximumminimum_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_maxmin_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_maxmin_num_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_maxmin_num_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_maxmin_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mbcnt_hi_u32_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mbcnt_lo_u32_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_med3_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_med3_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_med3_num_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_med3_num_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_med3_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_med3_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_min3_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_min3_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_min3_num_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_min3_num_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_min3_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_min3_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_min_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_min_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_minimum3_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_minimum3_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_minimum_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_minimum_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_minimum_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_minimummaximum_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_minimummaximum_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_minmax_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_minmax_num_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_minmax_num_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_minmax_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mqsad_pk_u16_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mqsad_u32_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_69a144>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_e016a1>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_msad_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mul_hi_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mul_hi_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mul_lo_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mul_lo_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_mullit_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_or3_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_or_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_pack_b32_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_perm_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_permlane16_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_c4593f>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_c4593f>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_permlane16_var_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_permlanex16_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_c4593f>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_c4593f>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_permlanex16_var_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_qsad_pk_u16_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_readlane_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_836716>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_977794>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_readlane_regrd_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_836716>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_977794>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_s_exp_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_836716>`, :ref:`src0<amdgpu_synid_gfx12_src0_85aab6>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_s_exp_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_836716>`, :ref:`src0<amdgpu_synid_gfx12_src0_c4593f>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_s_log_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_836716>`, :ref:`src0<amdgpu_synid_gfx12_src0_85aab6>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_s_log_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_836716>`, :ref:`src0<amdgpu_synid_gfx12_src0_c4593f>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_s_rcp_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_836716>`, :ref:`src0<amdgpu_synid_gfx12_src0_85aab6>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_s_rcp_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_836716>`, :ref:`src0<amdgpu_synid_gfx12_src0_c4593f>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_s_rsq_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_836716>`, :ref:`src0<amdgpu_synid_gfx12_src0_85aab6>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_s_rsq_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_836716>`, :ref:`src0<amdgpu_synid_gfx12_src0_c4593f>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_s_sqrt_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_836716>`, :ref:`src0<amdgpu_synid_gfx12_src0_85aab6>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_s_sqrt_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_836716>`, :ref:`src0<amdgpu_synid_gfx12_src0_c4593f>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sad_hi_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sad_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sad_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sad_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sub_co_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`sdst<amdgpu_synid_gfx12_sdst_e701cc>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sub_nc_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sub_nc_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_sub_nc_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_subrev_co_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`sdst<amdgpu_synid_gfx12_sdst_e701cc>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_trig_preop_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_writelane_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_c4593f>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_977794>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_xad_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_xor3_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_xor_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+
+VOP3P
+-----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ v_dot2_f32_bf16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_dot2_f32_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_dot4_f32_bf8_bf8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_dot4_f32_bf8_fp8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_dot4_f32_fp8_bf8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_dot4_f32_fp8_fp8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_dot4_i32_iu8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_dot4_u32_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_dot8_i32_iu4 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_dot8_u32_u4 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_fma_mix_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_fma_mixhi_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_fma_mixlo_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_pk_add_bf16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_add_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_add_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_add_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_ashrrev_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_fma_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_pk_fma_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`, :ref:`src1<amdgpu_synid_gfx12_src1_5cae62>`, :ref:`src2<amdgpu_synid_gfx12_src2_5cae62>`
+ v_pk_lshlrev_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_lshrrev_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_mad_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_pk_mad_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`, :ref:`src2<amdgpu_synid_gfx12_src2_5727cf>`
+ v_pk_max_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_max_num_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_max_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_maximum_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_min_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_min_num_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_min_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_minimum_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_mul_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_mul_lo_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_sub_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_pk_sub_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`, :ref:`src1<amdgpu_synid_gfx12_src1_5727cf>`
+ v_swmmac_bf16_16x16x32_bf16 :ref:`vdst<amdgpu_synid_gfx12_vdst_69a144>`, :ref:`src0<amdgpu_synid_gfx12_src0_e016a1>`, :ref:`src1<amdgpu_synid_gfx12_src1_731030>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>`
+ v_swmmac_f16_16x16x32_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_69a144>`, :ref:`src0<amdgpu_synid_gfx12_src0_e016a1>`, :ref:`src1<amdgpu_synid_gfx12_src1_731030>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>`
+ v_swmmac_f32_16x16x32_bf16 :ref:`vdst<amdgpu_synid_gfx12_vdst_47d3bc>`, :ref:`src0<amdgpu_synid_gfx12_src0_e016a1>`, :ref:`src1<amdgpu_synid_gfx12_src1_731030>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>`
+ v_swmmac_f32_16x16x32_bf8_bf8 :ref:`vdst<amdgpu_synid_gfx12_vdst_47d3bc>`, :ref:`src0<amdgpu_synid_gfx12_src0_fd235e>`, :ref:`src1<amdgpu_synid_gfx12_src1_e016a1>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>`
+ v_swmmac_f32_16x16x32_bf8_fp8 :ref:`vdst<amdgpu_synid_gfx12_vdst_47d3bc>`, :ref:`src0<amdgpu_synid_gfx12_src0_fd235e>`, :ref:`src1<amdgpu_synid_gfx12_src1_e016a1>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>`
+ v_swmmac_f32_16x16x32_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_47d3bc>`, :ref:`src0<amdgpu_synid_gfx12_src0_e016a1>`, :ref:`src1<amdgpu_synid_gfx12_src1_731030>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>`
+ v_swmmac_f32_16x16x32_fp8_bf8 :ref:`vdst<amdgpu_synid_gfx12_vdst_47d3bc>`, :ref:`src0<amdgpu_synid_gfx12_src0_fd235e>`, :ref:`src1<amdgpu_synid_gfx12_src1_e016a1>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>`
+ v_swmmac_f32_16x16x32_fp8_fp8 :ref:`vdst<amdgpu_synid_gfx12_vdst_47d3bc>`, :ref:`src0<amdgpu_synid_gfx12_src0_fd235e>`, :ref:`src1<amdgpu_synid_gfx12_src1_e016a1>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>`
+ v_swmmac_i32_16x16x32_iu4 :ref:`vdst<amdgpu_synid_gfx12_vdst_47d3bc>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`, :ref:`src1<amdgpu_synid_gfx12_src1_fd235e>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>`
+ v_swmmac_i32_16x16x32_iu8 :ref:`vdst<amdgpu_synid_gfx12_vdst_47d3bc>`, :ref:`src0<amdgpu_synid_gfx12_src0_fd235e>`, :ref:`src1<amdgpu_synid_gfx12_src1_e016a1>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>`
+ v_swmmac_i32_16x16x64_iu4 :ref:`vdst<amdgpu_synid_gfx12_vdst_47d3bc>`, :ref:`src0<amdgpu_synid_gfx12_src0_fd235e>`, :ref:`src1<amdgpu_synid_gfx12_src1_e016a1>`, :ref:`src2<amdgpu_synid_gfx12_src2_6802ce>`
+ v_wmma_bf16_16x16x16_bf16 :ref:`vdst<amdgpu_synid_gfx12_vdst_227281>`, :ref:`src0<amdgpu_synid_gfx12_src0_e016a1>`, :ref:`src1<amdgpu_synid_gfx12_src1_e016a1>`, :ref:`src2<amdgpu_synid_gfx12_src2_7b936a>`
+ v_wmma_f16_16x16x16_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_227281>`, :ref:`src0<amdgpu_synid_gfx12_src0_e016a1>`, :ref:`src1<amdgpu_synid_gfx12_src1_e016a1>`, :ref:`src2<amdgpu_synid_gfx12_src2_7b936a>`
+ v_wmma_f32_16x16x16_bf16 :ref:`vdst<amdgpu_synid_gfx12_vdst_227281>`, :ref:`src0<amdgpu_synid_gfx12_src0_e016a1>`, :ref:`src1<amdgpu_synid_gfx12_src1_e016a1>`, :ref:`src2<amdgpu_synid_gfx12_src2_96fbd3>`
+ v_wmma_f32_16x16x16_bf8_bf8 :ref:`vdst<amdgpu_synid_gfx12_vdst_227281>`, :ref:`src0<amdgpu_synid_gfx12_src0_fd235e>`, :ref:`src1<amdgpu_synid_gfx12_src1_fd235e>`, :ref:`src2<amdgpu_synid_gfx12_src2_96fbd3>`
+ v_wmma_f32_16x16x16_bf8_fp8 :ref:`vdst<amdgpu_synid_gfx12_vdst_227281>`, :ref:`src0<amdgpu_synid_gfx12_src0_fd235e>`, :ref:`src1<amdgpu_synid_gfx12_src1_fd235e>`, :ref:`src2<amdgpu_synid_gfx12_src2_96fbd3>`
+ v_wmma_f32_16x16x16_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_227281>`, :ref:`src0<amdgpu_synid_gfx12_src0_e016a1>`, :ref:`src1<amdgpu_synid_gfx12_src1_e016a1>`, :ref:`src2<amdgpu_synid_gfx12_src2_96fbd3>`
+ v_wmma_f32_16x16x16_fp8_bf8 :ref:`vdst<amdgpu_synid_gfx12_vdst_227281>`, :ref:`src0<amdgpu_synid_gfx12_src0_fd235e>`, :ref:`src1<amdgpu_synid_gfx12_src1_fd235e>`, :ref:`src2<amdgpu_synid_gfx12_src2_96fbd3>`
+ v_wmma_f32_16x16x16_fp8_fp8 :ref:`vdst<amdgpu_synid_gfx12_vdst_227281>`, :ref:`src0<amdgpu_synid_gfx12_src0_fd235e>`, :ref:`src1<amdgpu_synid_gfx12_src1_fd235e>`, :ref:`src2<amdgpu_synid_gfx12_src2_96fbd3>`
+ v_wmma_i32_16x16x16_iu4 :ref:`vdst<amdgpu_synid_gfx12_vdst_227281>`, :ref:`src0<amdgpu_synid_gfx12_src0_6802ce>`, :ref:`src1<amdgpu_synid_gfx12_src1_6802ce>`, :ref:`src2<amdgpu_synid_gfx12_src2_96fbd3>`
+ v_wmma_i32_16x16x16_iu8 :ref:`vdst<amdgpu_synid_gfx12_vdst_227281>`, :ref:`src0<amdgpu_synid_gfx12_src0_fd235e>`, :ref:`src1<amdgpu_synid_gfx12_src1_fd235e>`, :ref:`src2<amdgpu_synid_gfx12_src2_96fbd3>`
+ v_wmma_i32_16x16x32_iu4 :ref:`vdst<amdgpu_synid_gfx12_vdst_227281>`, :ref:`src0<amdgpu_synid_gfx12_src0_fd235e>`, :ref:`src1<amdgpu_synid_gfx12_src1_fd235e>`, :ref:`src2<amdgpu_synid_gfx12_src2_96fbd3>`
+
+VOPC
+----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ v_cmp_class_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_class_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_class_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_eq_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_eq_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_eq_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_eq_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_eq_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_eq_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_eq_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_eq_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_eq_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_f_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_f_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_f_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_f_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_f_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_f_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_f_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ge_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ge_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ge_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ge_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ge_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ge_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ge_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ge_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ge_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_gt_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_gt_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_gt_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_gt_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_gt_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_gt_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_gt_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_gt_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_gt_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_le_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_le_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_le_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_le_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_le_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_le_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_le_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_le_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_le_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_lg_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_lg_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_lg_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_lt_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_lt_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_lt_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_lt_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_lt_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_lt_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_lt_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_lt_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_lt_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ne_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ne_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ne_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ne_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ne_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ne_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_neq_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_neq_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_neq_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_nge_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_nge_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_nge_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ngt_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ngt_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_ngt_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_nle_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_nle_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_nle_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_nlg_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_nlg_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_nlg_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_nlt_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_nlt_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_nlt_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_o_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_o_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_o_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_t_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_t_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_t_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_t_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_t_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_t_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_t_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_u_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_u_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmp_u_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_006c40>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_class_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_class_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_class_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_eq_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_eq_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_eq_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_eq_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_eq_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_eq_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_eq_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_eq_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_eq_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_f_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_f_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_f_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_f_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_f_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_f_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_f_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ge_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ge_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ge_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ge_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ge_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ge_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ge_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ge_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ge_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_gt_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_gt_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_gt_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_gt_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_gt_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_gt_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_gt_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_gt_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_gt_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_le_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_le_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_le_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_le_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_le_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_le_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_le_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_le_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_le_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_lg_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_lg_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_lg_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_lt_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_lt_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_lt_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_lt_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_lt_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_lt_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_lt_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_lt_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_lt_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ne_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ne_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ne_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ne_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ne_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ne_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_neq_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_neq_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_neq_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_nge_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_nge_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_nge_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ngt_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ngt_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_ngt_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_nle_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_nle_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_nle_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_nlg_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_nlg_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_nlg_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_nlt_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_nlt_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_nlt_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_o_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_o_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_o_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_t_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_t_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_t_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_t_i32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_t_i64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_t_u32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_t_u64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_u_f16 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_u_f32 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5727cf>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_6802ce>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+ v_cmpx_u_f64 :ref:`vdst<amdgpu_synid_gfx12_vdst_7de8e7>`, :ref:`src0<amdgpu_synid_gfx12_src0_5cae62>`::ref:`m<amdgpu_synid_gfx12_m>`, :ref:`vsrc1<amdgpu_synid_gfx12_vsrc1_fd235e>`::ref:`m<amdgpu_synid_gfx12_m>` :ref:`omod<amdgpu_synid_omod>` :ref:`clamp<amdgpu_synid_clamp>`
+
+VOPD
+----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST0** **DST1** **SRC0** **SRC1** **SRC2** **SRC3** **SRC4** **SRC5**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ v_dual_add_f32_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_add_f32_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_add_f32_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_add_f32_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_add_f32_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_add_f32_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_add_f32_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_add_f32_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_add_f32_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_add_f32_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_add_f32_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_add_f32_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_add_f32_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`
+ v_dual_add_f32_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_add_f32_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_add_f32_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_add_f32_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_cndmask_b32_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_cndmask_b32_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_cndmask_b32_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_cndmask_b32_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_dot2acc_f32_bf16_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_bf16_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_bf16_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_bf16_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_dot2acc_f32_bf16_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_bf16_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_bf16_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_dot2acc_f32_bf16_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_bf16_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_dot2acc_f32_bf16_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_bf16_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_bf16_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_bf16_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`
+ v_dual_dot2acc_f32_bf16_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_bf16_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_bf16_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_bf16_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_f16_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_f16_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_f16_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_f16_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_dot2acc_f32_f16_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_f16_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_f16_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_dot2acc_f32_f16_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_f16_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_dot2acc_f32_f16_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_f16_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_f16_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_f16_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`
+ v_dual_dot2acc_f32_f16_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_f16_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_f16_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_dot2acc_f32_f16_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmaak_f32_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmaak_f32_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmac_f32_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmac_f32_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmac_f32_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmac_f32_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_fmac_f32_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmac_f32_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmac_f32_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmac_f32_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmac_f32_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmac_f32_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmac_f32_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmac_f32_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmac_f32_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`
+ v_dual_fmac_f32_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmac_f32_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmac_f32_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmac_f32_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_fmamk_f32_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmamk_f32_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_max_num_f32_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_max_num_f32_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_max_num_f32_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_max_num_f32_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_max_num_f32_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_max_num_f32_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_max_num_f32_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_max_num_f32_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_max_num_f32_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_max_num_f32_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_max_num_f32_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_max_num_f32_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_max_num_f32_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`
+ v_dual_max_num_f32_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_max_num_f32_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_max_num_f32_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_max_num_f32_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_min_num_f32_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_min_num_f32_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_min_num_f32_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_min_num_f32_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_min_num_f32_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_min_num_f32_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_min_num_f32_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_min_num_f32_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_min_num_f32_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_min_num_f32_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_min_num_f32_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_min_num_f32_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_min_num_f32_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`
+ v_dual_min_num_f32_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_min_num_f32_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_min_num_f32_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_min_num_f32_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mov_b32_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mov_b32_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mov_b32_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mov_b32_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_mov_b32_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mov_b32_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mov_b32_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_mov_b32_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mov_b32_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_mov_b32_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mov_b32_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mov_b32_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mov_b32_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`
+ v_dual_mov_b32_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mov_b32_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mov_b32_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mov_b32_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_dx9_zero_f32_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_dx9_zero_f32_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_dx9_zero_f32_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_dx9_zero_f32_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_mul_dx9_zero_f32_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_dx9_zero_f32_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_dx9_zero_f32_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_mul_dx9_zero_f32_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_dx9_zero_f32_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_mul_dx9_zero_f32_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_dx9_zero_f32_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_dx9_zero_f32_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_dx9_zero_f32_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`
+ v_dual_mul_dx9_zero_f32_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_dx9_zero_f32_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_dx9_zero_f32_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_dx9_zero_f32_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_f32_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_f32_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_f32_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_f32_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_mul_f32_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_f32_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_f32_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_mul_f32_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_f32_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_mul_f32_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_f32_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_f32_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_f32_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`
+ v_dual_mul_f32_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_f32_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_f32_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_mul_f32_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_sub_f32_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_sub_f32_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_sub_f32_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_sub_f32_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_sub_f32_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_sub_f32_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_sub_f32_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_sub_f32_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_sub_f32_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_sub_f32_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_sub_f32_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_sub_f32_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_sub_f32_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`
+ v_dual_sub_f32_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_sub_f32_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_sub_f32_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_sub_f32_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_subrev_f32_x_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_subrev_f32_x_add_nc_u32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_subrev_f32_x_and_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_subrev_f32_x_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_subrev_f32_x_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_subrev_f32_x_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_subrev_f32_x_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_subrev_f32_x_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_subrev_f32_x_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_subrev_f32_x_lshlrev_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_subrev_f32_x_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_subrev_f32_x_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_subrev_f32_x_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`
+ v_dual_subrev_f32_x_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_subrev_f32_x_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_subrev_f32_x_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_subrev_f32_x_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+
+VOPDX
+-----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ v_dual_add_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`
+ v_dual_cndmask_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`vcc<amdgpu_synid_gfx12_vcc>`
+ v_dual_dot2acc_f32_bf16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`
+ v_dual_dot2acc_f32_f16 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`
+ v_dual_fmaak_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`
+ v_dual_fmac_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`
+ v_dual_fmamk_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`literal<amdgpu_synid_gfx12_literal_81e671>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`
+ v_dual_max_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`
+ v_dual_min_num_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`
+ v_dual_mov_b32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`
+ v_dual_mul_dx9_zero_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`
+ v_dual_mul_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`
+ v_dual_sub_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`
+ v_dual_subrev_f32 :ref:`vdstx<amdgpu_synid_gfx12_vdstx>`, :ref:`srcx0<amdgpu_synid_gfx12_srcx0>`, :ref:`vsrcx1<amdgpu_synid_gfx12_vsrcx1>`
+
+VOPDY
+-----
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ v_dual_add_nc_u32 :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_and_b32 :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+ v_dual_lshlrev_b32 :ref:`vdsty<amdgpu_synid_gfx12_vdsty>`, :ref:`srcy0<amdgpu_synid_gfx12_srcy0>`, :ref:`vsrcy1<amdgpu_synid_gfx12_vsrcy1>`
+
+VSAMPLE
+-------
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ image_gather4 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_b :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_b_cl :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_c :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_c_b :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_c_b_cl :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_c_cl :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_c_l :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_c_lz :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_c_lz_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_cl :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_l :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_lz :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_lz_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_gather4h :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_get_lod :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_msaa_load :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_d82160>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_b :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_b_cl :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_b_cl_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_b_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_b :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_b_cl :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_b_cl_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_b_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_cl :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_cl_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_d :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_d_cl :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_d_cl_g16 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_d_cl_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_d_cl_o_g16 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_d_g16 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_d_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_d_o_g16 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_l :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_l_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_lz :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_lz_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_c_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_cl :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_cl_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_d :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_d_cl :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_d_cl_g16 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_d_cl_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_d_cl_o_g16 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_d_g16 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_d_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_d_o_g16 :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_l :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_l_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_lz :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_lz_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ image_sample_o :ref:`vdata<amdgpu_synid_gfx12_vdata_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`rsrc<amdgpu_synid_gfx12_rsrc_c9f929>`, :ref:`samp<amdgpu_synid_gfx12_samp>` :ref:`dmask<amdgpu_synid_dmask>` :ref:`tfe<amdgpu_synid_tfe>` :ref:`unorm<amdgpu_synid_unorm>` :ref:`lwe<amdgpu_synid_lwe>` :ref:`dim<amdgpu_synid_dim>` :ref:`r128<amdgpu_synid_r128>` :ref:`a16<amdgpu_synid_a16>` :ref:`d16<amdgpu_synid_d16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+
+VSCRATCH
+--------
+
+.. parsed-literal::
+
+ **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS**
+ \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+ scratch_load_b128 :ref:`vdst<amdgpu_synid_gfx12_vdst_69a144>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_b32 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_b64 :ref:`vdst<amdgpu_synid_gfx12_vdst_bdb32f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_b96 :ref:`vdst<amdgpu_synid_gfx12_vdst_48e42f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_block :ref:`vdst<amdgpu_synid_gfx12_vdst_2eda77>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_d16_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_d16_hi_b16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_d16_hi_i8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_d16_hi_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_d16_i8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_d16_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_i16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_i8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_lds_b32 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_lds_i16 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_lds_i8 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_lds_u16 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_lds_u8 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_u16 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_load_u8 :ref:`vdst<amdgpu_synid_gfx12_vdst_89680f>`, :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_store_b128 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_e016a1>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_store_b16 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_store_b32 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_store_b64 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_fd235e>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_store_b8 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_store_b96 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_56f215>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_store_block :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_89fd7b>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_store_d16_hi_b16 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+ scratch_store_d16_hi_b8 :ref:`vaddr<amdgpu_synid_gfx12_vaddr_c8b8d4>`, :ref:`vsrc<amdgpu_synid_gfx12_vsrc_6802ce>`, :ref:`saddr<amdgpu_synid_gfx12_saddr_d42b64>` :ref:`offset<amdgpu_synid_ds_offset16>` :ref:`th<amdgpu_synid_th>` :ref:`scope<amdgpu_synid_scope>` :ref:`nv<amdgpu_synid_nv>`
+
+.. |---| unicode:: U+02014 .. em dash
+
+.. toctree::
+ :hidden:
+
+ gfx12_addr
+ gfx12_attr
+ gfx12_data0_56f215
+ gfx12_data0_6802ce
+ gfx12_data0_e016a1
+ gfx12_data0_fd235e
+ gfx12_data1_6802ce
+ gfx12_data1_731030
+ gfx12_data1_e016a1
+ gfx12_data1_fd235e
+ gfx12_ioffset
+ gfx12_literal_1f74c7
+ gfx12_literal_81e671
+ gfx12_m
+ gfx12_rsrc_5fe6d8
+ gfx12_rsrc_c9f929
+ gfx12_saddr_cdc95c
+ gfx12_saddr_d42b64
+ gfx12_samp
+ gfx12_sbase_453b95
+ gfx12_sbase_47adb7
+ gfx12_sdata_0974a4
+ gfx12_sdata_354189
+ gfx12_sdata_4585b8
+ gfx12_sdata_5c7b50
+ gfx12_sdata_6c003b
+ gfx12_sdata_836716
+ gfx12_sdata_d725ab
+ gfx12_sdata_dd9dd8
+ gfx12_sdst_006c40
+ gfx12_sdst_20064d
+ gfx12_sdst_354189
+ gfx12_sdst_836716
+ gfx12_sdst_ced58d
+ gfx12_sdst_e701cc
+ gfx12_simm16_15ccdd
+ gfx12_simm16_218bea
+ gfx12_simm16_39b593
+ gfx12_simm16_3d2a4f
+ gfx12_simm16_730a13
+ gfx12_simm16_7ed651
+ gfx12_simm16_81e671
+ gfx12_simm16_c98889
+ gfx12_simm16_cc1716
+ gfx12_simm16_ee8b30
+ gfx12_soffset_8ec073
+ gfx12_soffset_c5b88c
+ gfx12_soffset_ec005a
+ gfx12_src0_5727cf
+ gfx12_src0_5cae62
+ gfx12_src0_6802ce
+ gfx12_src0_85aab6
+ gfx12_src0_c4593f
+ gfx12_src0_e016a1
+ gfx12_src0_fd235e
+ gfx12_src1_5727cf
+ gfx12_src1_5cae62
+ gfx12_src1_6802ce
+ gfx12_src1_731030
+ gfx12_src1_977794
+ gfx12_src1_c4593f
+ gfx12_src1_e016a1
+ gfx12_src1_fd235e
+ gfx12_src2_2797bc
+ gfx12_src2_5727cf
+ gfx12_src2_5cae62
+ gfx12_src2_6802ce
+ gfx12_src2_7b936a
+ gfx12_src2_96fbd3
+ gfx12_src2_c4593f
+ gfx12_src2_e016a1
+ gfx12_srcx0
+ gfx12_srcy0
+ gfx12_ssrc0_007f9c
+ gfx12_ssrc0_1a9ca5
+ gfx12_ssrc0_245536
+ gfx12_ssrc0_2797bc
+ gfx12_ssrc0_bbb4c6
+ gfx12_ssrc0_c4593f
+ gfx12_ssrc1_bbb4c6
+ gfx12_ssrc1_c4593f
+ gfx12_tgt
+ gfx12_vaddr_a972b9
+ gfx12_vaddr_c12f43
+ gfx12_vaddr_c8b8d4
+ gfx12_vaddr_d82160
+ gfx12_vaddr_f2b449
+ gfx12_vcc
+ gfx12_vdata_2eda77
+ gfx12_vdata_48e42f
+ gfx12_vdata_69a144
+ gfx12_vdata_89680f
+ gfx12_vdata_aac3e8
+ gfx12_vdata_bdb32f
+ gfx12_vdst_006c40
+ gfx12_vdst_227281
+ gfx12_vdst_2eda77
+ gfx12_vdst_47d3bc
+ gfx12_vdst_48e42f
+ gfx12_vdst_69a144
+ gfx12_vdst_7de8e7
+ gfx12_vdst_836716
+ gfx12_vdst_89680f
+ gfx12_vdst_bdb32f
+ gfx12_vdstx
+ gfx12_vdsty
+ gfx12_vsrc0
+ gfx12_vsrc1_6802ce
+ gfx12_vsrc1_fd235e
+ gfx12_vsrc2
+ gfx12_vsrc3
+ gfx12_vsrc_56f215
+ gfx12_vsrc_6802ce
+ gfx12_vsrc_89fd7b
+ gfx12_vsrc_e016a1
+ gfx12_vsrc_fd235e
+ gfx12_vsrcx1
+ gfx12_vsrcy1
+ gfx12_clause
+ gfx12_delay
+ gfx12_hwreg
+ gfx12_imm16
+ gfx12_label
+ gfx12_sendmsg
+ gfx12_sendmsg_rtn
+ gfx12_version
+ gfx12_waitcnt
diff --git a/llvm/docs/AMDGPU/gfx12_addr.rst b/llvm/docs/AMDGPU/gfx12_addr.rst
new file mode 100644
index 0000000..d2fc0e0
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_addr.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_addr:
+
+addr
+====
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_attr.rst b/llvm/docs/AMDGPU/gfx12_attr.rst
new file mode 100644
index 0000000..a6c5c27
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_attr.rst
@@ -0,0 +1,28 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_attr:
+
+attr
+====
+
+Interpolation attribute and channel:
+
+ ============== ===================================
+ Syntax         Description
+ ============== ===================================
+ attr{0..32}.x  Attribute 0..32 with *x* channel.
+ attr{0..32}.y  Attribute 0..32 with *y* channel.
+ attr{0..32}.z  Attribute 0..32 with *z* channel.
+ attr{0..32}.w  Attribute 0..32 with *w* channel.
+ ============== ===================================
+
+Examples:
+
+.. parsed-literal::
+
+ ds_param_load v1, attr0.x
diff --git a/llvm/docs/AMDGPU/gfx12_clause.rst b/llvm/docs/AMDGPU/gfx12_clause.rst
new file mode 100644
index 0000000..88feb3b
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_clause.rst
@@ -0,0 +1,7 @@
+.. _amdgpu_synid_clause:
+
+clause
+======
+
+Description of a clause following this instruction.
+
diff --git a/llvm/docs/AMDGPU/gfx12_data0_56f215.rst b/llvm/docs/AMDGPU/gfx12_data0_56f215.rst
new file mode 100644
index 0000000..d8dde00
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data0_56f215.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_data0_56f215:
+
+data0
+=====
+
+Instruction input.
+
+*Size:* 3 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_data0_6802ce.rst b/llvm/docs/AMDGPU/gfx12_data0_6802ce.rst
new file mode 100644
index 0000000..02fe36f
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data0_6802ce.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_data0_6802ce:
+
+data0
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_data0_e016a1.rst b/llvm/docs/AMDGPU/gfx12_data0_e016a1.rst
new file mode 100644
index 0000000..914715b
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data0_e016a1.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_data0_e016a1:
+
+data0
+=====
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_data0_fd235e.rst b/llvm/docs/AMDGPU/gfx12_data0_fd235e.rst
new file mode 100644
index 0000000..7617c61
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data0_fd235e.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_data0_fd235e:
+
+data0
+=====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_data1_6802ce.rst b/llvm/docs/AMDGPU/gfx12_data1_6802ce.rst
new file mode 100644
index 0000000..318db2d
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data1_6802ce.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_data1_6802ce:
+
+data1
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_data1_731030.rst b/llvm/docs/AMDGPU/gfx12_data1_731030.rst
new file mode 100644
index 0000000..1a6eda6
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data1_731030.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_data1_731030:
+
+data1
+=====
+
+Instruction input.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_data1_e016a1.rst b/llvm/docs/AMDGPU/gfx12_data1_e016a1.rst
new file mode 100644
index 0000000..dee4148
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data1_e016a1.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_data1_e016a1:
+
+data1
+=====
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_data1_fd235e.rst b/llvm/docs/AMDGPU/gfx12_data1_fd235e.rst
new file mode 100644
index 0000000..c8d4a88
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data1_fd235e.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_data1_fd235e:
+
+data1
+=====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_delay.rst b/llvm/docs/AMDGPU/gfx12_delay.rst
new file mode 100644
index 0000000..600ece7
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_delay.rst
@@ -0,0 +1,74 @@
+.. _amdgpu_synid_delay:
+
+delay
+=====
+
+A delay between dependent SALU/VALU instructions.
+This operand may specify delays for two instructions:
+the one immediately after the current *s_delay_alu* instruction
+and a second instruction whose position is indicated by *SKIP*.
+
+The bits of this operand have the following meaning:
+
+ ===== ========================================================== ============
+ Bits Description Value Range
+ ===== ========================================================== ============
+ 3:0 ID0: indicates a delay for the first instruction. 0..11
+ 6:4 SKIP: indicates the position of the second instruction. 0..5
+ 10:7 ID1: indicates a delay for the second instruction. 0..11
+ ===== ========================================================== ============
+
+This operand may be specified as one of the following:
+
+* An :ref:`integer_number<amdgpu_synid_integer_number>` or an :ref:`absolute_expression<amdgpu_synid_absolute_expression>`. The value must be in the range 0..0xFFFF.
+* A combination of *instid0*, *instskip*, *instid1* values described below.
+
+ ======================== =========================== ===============
+ Syntax Description Default Value
+ ======================== =========================== ===============
+ instid0(<*ID name*>) A symbolic *ID0* value. instid0(NO_DEP)
+ instskip(<*SKIP name*>) A symbolic *SKIP* value. instskip(SAME)
+ instid1(<*ID name*>) A symbolic *ID1* value. instid1(NO_DEP)
+ ======================== =========================== ===============
+
+These values may be specified in any order.
+When more than one value is specified, the values must be separated from each other by a '|'.
+
+Valid *ID names* are defined below.
+
+ =================== ===================================================================
+ Name Description
+ =================== ===================================================================
+ NO_DEP No dependency on any prior instruction. This is the default value.
+ VALU_DEP_1 Dependency on a previous VALU instruction, 1 opcode back.
+ VALU_DEP_2 Dependency on a previous VALU instruction, 2 opcodes back.
+ VALU_DEP_3 Dependency on a previous VALU instruction, 3 opcodes back.
+ VALU_DEP_4 Dependency on a previous VALU instruction, 4 opcodes back.
+ TRANS32_DEP_1 Dependency on a previous TRANS32 instruction, 1 opcode back.
+ TRANS32_DEP_2 Dependency on a previous TRANS32 instruction, 2 opcodes back.
+ TRANS32_DEP_3 Dependency on a previous TRANS32 instruction, 3 opcodes back.
+ FMA_ACCUM_CYCLE_1 Single cycle penalty for FMA accumulation.
+ SALU_CYCLE_1 1 cycle penalty for a prior SALU instruction.
+ SALU_CYCLE_2 2 cycle penalty for a prior SALU instruction.
+ SALU_CYCLE_3 3 cycle penalty for a prior SALU instruction.
+ =================== ===================================================================
+
+Legal *SKIP names* are described in the following table.
+
+ ======== ============================================================================
+ Name Description
+ ======== ============================================================================
+ SAME Apply second dependency to the same instruction. This is the default value.
+ NEXT Apply second dependency to the next instruction.
+ SKIP_1 Skip 1 instruction then apply dependency.
+ SKIP_2 Skip 2 instructions then apply dependency.
+ SKIP_3 Skip 3 instructions then apply dependency.
+ SKIP_4 Skip 4 instructions then apply dependency.
+ ======== ============================================================================
+
+Examples:
+
+.. parsed-literal::
+
+ s_delay_alu instid0(VALU_DEP_1)
+ s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
diff --git a/llvm/docs/AMDGPU/gfx12_hwreg.rst b/llvm/docs/AMDGPU/gfx12_hwreg.rst
new file mode 100644
index 0000000..d99cb20
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_hwreg.rst
@@ -0,0 +1,76 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_hwreg:
+
+hwreg
+=====
+
+Bits of a hardware register being accessed.
+
+The bits of this operand have the following meaning:
+
+ ======= ===================== ============
+ Bits    Description           Value Range
+ ======= ===================== ============
+ 5:0     Register *id*.        0..63
+ 10:6    First bit *offset*.   0..31
+ 15:11   *Size* in bits.       1..32
+ ======= ===================== ============
+
+This operand may be specified as one of the following:
+
+* An :ref:`integer_number<amdgpu_synid_integer_number>` or an :ref:`absolute_expression<amdgpu_synid_absolute_expression>`. The value must be in the range 0..0xFFFF.
+* An *hwreg* value described below.
+
+ ==================================== ============================================================================
+ Hwreg Value Syntax Description
+ ==================================== ============================================================================
+ hwreg({0..63}) All bits of a register indicated by its *id*.
+ hwreg(<*name*>) All bits of a register indicated by its *name*.
+ hwreg({0..63}, {0..31}, {1..32}) Register bits indicated by register *id*, first bit *offset* and *size*.
+ hwreg(<*name*>, {0..31}, {1..32}) Register bits indicated by register *name*, first bit *offset* and *size*.
+ ==================================== ============================================================================
+
+Numeric values may be specified as positive :ref:`integer numbers<amdgpu_synid_integer_number>`
+or :ref:`absolute expressions<amdgpu_synid_absolute_expression>`.
+
+Defined register *names* include:
+
+ =================== ==========================================
+ Name Description
+ =================== ==========================================
+ HW_REG_MODE Shader writeable mode bits.
+ HW_REG_STATUS Shader read-only status.
+ HW_REG_TRAPSTS Trap status.
+ HW_REG_HW_ID1 Id of wave, simd, compute unit, etc.
+ HW_REG_HW_ID2 Id of queue, pipeline, etc.
+ HW_REG_GPR_ALLOC Per-wave SGPR and VGPR allocation.
+ HW_REG_LDS_ALLOC Per-wave LDS allocation.
+ HW_REG_IB_STS Counters of outstanding instructions.
+ HW_REG_SH_MEM_BASES Memory aperture.
+ HW_REG_FLAT_SCR_LO flat_scratch_lo register.
+ HW_REG_FLAT_SCR_HI flat_scratch_hi register.
+ =================== ==========================================
+
+Examples:
+
+.. parsed-literal::
+
+ reg = 1
+ offset = 2
+ size = 4
+ hwreg_enc = reg | (offset << 6) | ((size - 1) << 11)
+
+ s_getreg_b32 s2, 0x1881
+ s_getreg_b32 s2, hwreg_enc // the same as above
+ s_getreg_b32 s2, hwreg(1, 2, 4) // the same as above
+ s_getreg_b32 s2, hwreg(reg, offset, size) // the same as above
+
+ s_getreg_b32 s2, hwreg(15)
+ s_getreg_b32 s2, hwreg(51, 1, 31)
+ s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1)
diff --git a/llvm/docs/AMDGPU/gfx12_imm16.rst b/llvm/docs/AMDGPU/gfx12_imm16.rst
new file mode 100644
index 0000000..44e6d58
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_imm16.rst
@@ -0,0 +1,7 @@
+.. _amdgpu_synid_imm16:
+
+imm16
+=====
+
+An :ref:`integer_number<amdgpu_synid_integer_number>` or an :ref:`absolute_expression<amdgpu_synid_absolute_expression>`. The value must be in the range -32768..65535.
+
diff --git a/llvm/docs/AMDGPU/gfx12_ioffset.rst b/llvm/docs/AMDGPU/gfx12_ioffset.rst
new file mode 100644
index 0000000..0901b77
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_ioffset.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_ioffset:
+
+ioffset
+=======
+
+*Size:* 1 dword.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_label.rst b/llvm/docs/AMDGPU/gfx12_label.rst
new file mode 100644
index 0000000..bdd6e1c
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_label.rst
@@ -0,0 +1,29 @@
+.. _amdgpu_synid_label:
+
+label
+=====
+
+A branch target which is a 16-bit signed integer treated as a PC-relative dword offset.
+
+This operand may be specified as one of the following:
+
+* An :ref:`integer_number<amdgpu_synid_integer_number>` or an :ref:`absolute_expression<amdgpu_synid_absolute_expression>`. The value must be in the range -32768..65535.
+* A :ref:`symbol<amdgpu_synid_symbol>` (for example, a label) representing a relocatable address in the same compilation unit that references it. The value is handled as a 16-bit PC-relative dword offset to be resolved by a linker.
+
+Examples:
+
+.. parsed-literal::
+
+ offset = 30
+ label_1:
+ label_2 = . + 4
+
+ s_branch 32
+ s_branch offset + 2
+ s_branch label_1
+ s_branch label_2
+ s_branch label_3
+ s_branch label_4
+
+ label_3 = label_2 + 4
+ label_4:
diff --git a/llvm/docs/AMDGPU/gfx12_literal_1f74c7.rst b/llvm/docs/AMDGPU/gfx12_literal_1f74c7.rst
new file mode 100644
index 0000000..7442c5d
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_literal_1f74c7.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_literal_1f74c7:
+
+literal
+=======
+
+*Size:* 2 dwords.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_literal_81e671.rst b/llvm/docs/AMDGPU/gfx12_literal_81e671.rst
new file mode 100644
index 0000000..ab1b056
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_literal_81e671.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_literal_81e671:
+
+literal
+=======
+
+*Size:* 1 dword.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_m.rst b/llvm/docs/AMDGPU/gfx12_m.rst
new file mode 100644
index 0000000..7cfee90
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_m.rst
@@ -0,0 +1,13 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_m:
+
+m
+=
+
+This operand may be used with floating point operand modifiers :ref:`abs<amdgpu_synid_abs>` and :ref:`neg<amdgpu_synid_neg>`.
diff --git a/llvm/docs/AMDGPU/gfx12_rsrc_5fe6d8.rst b/llvm/docs/AMDGPU/gfx12_rsrc_5fe6d8.rst
new file mode 100644
index 0000000..d1a475f
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_rsrc_5fe6d8.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_rsrc_5fe6d8:
+
+rsrc
+====
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_rsrc_c9f929.rst b/llvm/docs/AMDGPU/gfx12_rsrc_c9f929.rst
new file mode 100644
index 0000000..180ae06
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_rsrc_c9f929.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_rsrc_c9f929:
+
+rsrc
+====
+
+Instruction input.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_saddr_cdc95c.rst b/llvm/docs/AMDGPU/gfx12_saddr_cdc95c.rst
new file mode 100644
index 0000000..4b3511f
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_saddr_cdc95c.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_saddr_cdc95c:
+
+saddr
+=====
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_saddr_d42b64.rst b/llvm/docs/AMDGPU/gfx12_saddr_d42b64.rst
new file mode 100644
index 0000000..d3de11d
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_saddr_d42b64.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_saddr_d42b64:
+
+saddr
+=====
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_samp.rst b/llvm/docs/AMDGPU/gfx12_samp.rst
new file mode 100644
index 0000000..2bb15e5
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_samp.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_samp:
+
+samp
+====
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_sbase_453b95.rst b/llvm/docs/AMDGPU/gfx12_sbase_453b95.rst
new file mode 100644
index 0000000..54c2dee
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sbase_453b95.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sbase_453b95:
+
+sbase
+=====
+
+A 128-bit buffer resource constant for scalar memory operations which provides a base address, a size and a stride.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_sbase_47adb7.rst b/llvm/docs/AMDGPU/gfx12_sbase_47adb7.rst
new file mode 100644
index 0000000..2308b3d
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sbase_47adb7.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sbase_47adb7:
+
+sbase
+=====
+
+A 64-bit base address for scalar memory operations.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_0974a4.rst b/llvm/docs/AMDGPU/gfx12_sdata_0974a4.rst
new file mode 100644
index 0000000..d498f8c
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_0974a4.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_0974a4:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_354189.rst b/llvm/docs/AMDGPU/gfx12_sdata_354189.rst
new file mode 100644
index 0000000..c506654
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_354189.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_354189:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_4585b8.rst b/llvm/docs/AMDGPU/gfx12_sdata_4585b8.rst
new file mode 100644
index 0000000..42f66f3
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_4585b8.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_4585b8:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_5c7b50.rst b/llvm/docs/AMDGPU/gfx12_sdata_5c7b50.rst
new file mode 100644
index 0000000..028461a
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_5c7b50.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_5c7b50:
+
+sdata
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_6c003b.rst b/llvm/docs/AMDGPU/gfx12_sdata_6c003b.rst
new file mode 100644
index 0000000..87e19a9
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_6c003b.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_6c003b:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 16 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_836716.rst b/llvm/docs/AMDGPU/gfx12_sdata_836716.rst
new file mode 100644
index 0000000..be1bce9
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_836716.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_836716:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_d725ab.rst b/llvm/docs/AMDGPU/gfx12_sdata_d725ab.rst
new file mode 100644
index 0000000..c882df8
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_d725ab.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_d725ab:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`simm8<amdgpu_synid_simm8>`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_dd9dd8.rst b/llvm/docs/AMDGPU/gfx12_sdata_dd9dd8.rst
new file mode 100644
index 0000000..6465889
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_dd9dd8.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_dd9dd8:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 3 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_sdst_006c40.rst b/llvm/docs/AMDGPU/gfx12_sdst_006c40.rst
new file mode 100644
index 0000000..f269b05
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdst_006c40.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdst_006c40:
+
+sdst
+====
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`vcc<amdgpu_synid_vcc>`
diff --git a/llvm/docs/AMDGPU/gfx12_sdst_20064d.rst b/llvm/docs/AMDGPU/gfx12_sdst_20064d.rst
new file mode 100644
index 0000000..83c11a2
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdst_20064d.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdst_20064d:
+
+sdst
+====
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec_hi<amdgpu_synid_exec_hi>`, :ref:`exec_lo<amdgpu_synid_exec_lo>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_sdst_354189.rst b/llvm/docs/AMDGPU/gfx12_sdst_354189.rst
new file mode 100644
index 0000000..8433406
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdst_354189.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdst_354189:
+
+sdst
+====
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_sdst_836716.rst b/llvm/docs/AMDGPU/gfx12_sdst_836716.rst
new file mode 100644
index 0000000..abce569
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdst_836716.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdst_836716:
+
+sdst
+====
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_sdst_ced58d.rst b/llvm/docs/AMDGPU/gfx12_sdst_ced58d.rst
new file mode 100644
index 0000000..e0072d9
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdst_ced58d.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdst_ced58d:
+
+sdst
+====
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`exec_hi<amdgpu_synid_exec_hi>`, :ref:`exec_lo<amdgpu_synid_exec_lo>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_sdst_e701cc.rst b/llvm/docs/AMDGPU/gfx12_sdst_e701cc.rst
new file mode 100644
index 0000000..33e8c37
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdst_e701cc.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_sdst_e701cc:
+
+sdst
+====
+
+Instruction output.
+
+*Size:* 1 dword if wavefront size is 32, otherwise 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_sendmsg.rst b/llvm/docs/AMDGPU/gfx12_sendmsg.rst
new file mode 100644
index 0000000..cb51be0
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sendmsg.rst
@@ -0,0 +1,48 @@
+.. _amdgpu_synid_sendmsg:
+
+sendmsg
+=======
+
+An 8-bit value in simm16[7:0] encodes the message type.
+
+This operand may be specified as one of the following:
+
+* An :ref:`integer_number<amdgpu_synid_integer_number>` or an :ref:`absolute_expression<amdgpu_synid_absolute_expression>`. The value must be in the range 0..0xFFFF.
+* A *sendmsg* value described below.
+
+
+ ==================================== ====================================================
+ Sendmsg Value Syntax                 Description
+ ==================================== ====================================================
+ sendmsg(<*type*>)                    A message identified by its *type*.
+ ==================================== ====================================================
+
+*Type* may be specified using message *name* or message *id*.
+
+Numeric values may be specified as positive :ref:`integer numbers<amdgpu_synid_integer_number>`
+or :ref:`absolute expressions<amdgpu_synid_absolute_expression>`.
+
+
+Only the following message types are valid.
+
+ ====================== ===========
+ Message type           simm16[7:0]
+ ====================== ===========
+ Reserved               0
+ MSG_INTERRUPT          1
+ MSG_HS_TESSFACTOR      2
+ MSG_DEALLOC_VGPRS      3
+ MSG_GS_ALLOC_REQ       9
+ ====================== ===========
+
+Examples:
+
+.. parsed-literal::
+
+ // numeric message code
+ msg = 0x1
+ s_sendmsg 0x3
+ s_sendmsg msg + 2
+
+ // sendmsg with strict arguments validation
+ s_sendmsg sendmsg(MSG_INTERRUPT)
diff --git a/llvm/docs/AMDGPU/gfx12_sendmsg_rtn.rst b/llvm/docs/AMDGPU/gfx12_sendmsg_rtn.rst
new file mode 100644
index 0000000..ebb591d
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sendmsg_rtn.rst
@@ -0,0 +1,30 @@
+.. _amdgpu_synid_sendmsg_rtn:
+
+sendmsg_rtn
+===========
+
+An 8-bit value in the instruction that encodes the message type.
+
+This operand may be specified as one of the following:
+
+ * An :ref:`integer_number<amdgpu_synid_integer_number>` or an :ref:`absolute_expression<amdgpu_synid_absolute_expression>`. The value must be in the range 0..0xFFFF.
+ * A *sendmsg* value described below.
+
+ ==================================== ====================================================
+ Sendmsg Value Syntax                 Description
+ ==================================== ====================================================
+ sendmsg(MSG_RTN_GET_DOORBELL)        Get doorbell ID.
+ sendmsg(MSG_RTN_GET_DDID)            Get Draw/Dispatch ID.
+ sendmsg(MSG_RTN_GET_TMA)             Get TMA value.
+ sendmsg(MSG_RTN_GET_TBA)             Get TBA value.
+ sendmsg(MSG_RTN_GET_REALTIME)        Get REALTIME value.
+ sendmsg(MSG_RTN_SAVE_WAVE)           Report that this wave is ready to be context-saved.
+ ==================================== ====================================================
+
+Examples:
+
+.. parsed-literal::
+
+ s_sendmsg_rtn_b32 s0, 132
+ s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_REALTIME)
+
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_15ccdd.rst b/llvm/docs/AMDGPU/gfx12_simm16_15ccdd.rst
new file mode 100644
index 0000000..0cb1233
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_15ccdd.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_15ccdd:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`version<amdgpu_synid_version>`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_218bea.rst b/llvm/docs/AMDGPU/gfx12_simm16_218bea.rst
new file mode 100644
index 0000000..e08605e
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_218bea.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_218bea:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`waitcnt<amdgpu_synid_waitcnt>`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_39b593.rst b/llvm/docs/AMDGPU/gfx12_simm16_39b593.rst
new file mode 100644
index 0000000..babb4b6
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_39b593.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_39b593:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`imm16<amdgpu_synid_imm16>`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_3d2a4f.rst b/llvm/docs/AMDGPU/gfx12_simm16_3d2a4f.rst
new file mode 100644
index 0000000..cc8dbc6
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_3d2a4f.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_3d2a4f:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`label<amdgpu_synid_label>`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_730a13.rst b/llvm/docs/AMDGPU/gfx12_simm16_730a13.rst
new file mode 100644
index 0000000..93596db
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_730a13.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_730a13:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`clause<amdgpu_synid_clause>`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_7ed651.rst b/llvm/docs/AMDGPU/gfx12_simm16_7ed651.rst
new file mode 100644
index 0000000..fc63930
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_7ed651.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_7ed651:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`hwreg<amdgpu_synid_hwreg>`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_81e671.rst b/llvm/docs/AMDGPU/gfx12_simm16_81e671.rst
new file mode 100644
index 0000000..16dcf39
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_81e671.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_81e671:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_c98889.rst b/llvm/docs/AMDGPU/gfx12_simm16_c98889.rst
new file mode 100644
index 0000000..03e007af
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_c98889.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_c98889:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`delay<amdgpu_synid_delay>`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_cc1716.rst b/llvm/docs/AMDGPU/gfx12_simm16_cc1716.rst
new file mode 100644
index 0000000..e53f812
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_cc1716.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_cc1716:
+
+simm16
+======
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`hwreg<amdgpu_synid_hwreg>`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_ee8b30.rst b/llvm/docs/AMDGPU/gfx12_simm16_ee8b30.rst
new file mode 100644
index 0000000..9bdac9b
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_ee8b30.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_ee8b30:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`sendmsg<amdgpu_synid_sendmsg>`
diff --git a/llvm/docs/AMDGPU/gfx12_soffset_8ec073.rst b/llvm/docs/AMDGPU/gfx12_soffset_8ec073.rst
new file mode 100644
index 0000000..44de030
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_soffset_8ec073.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_soffset_8ec073:
+
+soffset
+=======
+
+An unsigned 20-bit offset added to the base address to get the memory address.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_soffset_c5b88c.rst b/llvm/docs/AMDGPU/gfx12_soffset_c5b88c.rst
new file mode 100644
index 0000000..d115150
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_soffset_c5b88c.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_soffset_c5b88c:
+
+soffset
+=======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_soffset_ec005a.rst b/llvm/docs/AMDGPU/gfx12_soffset_ec005a.rst
new file mode 100644
index 0000000..bd571b6
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_soffset_ec005a.rst
@@ -0,0 +1,20 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_soffset_ec005a:
+
+soffset
+=======
+
+An offset added to the base address to get the memory address.
+
+* If offset is specified as a register, it supplies an unsigned byte offset.
+* If offset is specified as a 21-bit immediate, it supplies a signed byte offset.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_src0_5727cf.rst b/llvm/docs/AMDGPU/gfx12_src0_5727cf.rst
new file mode 100644
index 0000000..15fde5c
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_5727cf.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src0_5727cf:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`, :ref:`exec_hi<amdgpu_synid_exec_hi>`, :ref:`exec_lo<amdgpu_synid_exec_lo>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_src0_5cae62.rst b/llvm/docs/AMDGPU/gfx12_src0_5cae62.rst
new file mode 100644
index 0000000..fa02f046
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_5cae62.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src0_5cae62:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/llvm/docs/AMDGPU/gfx12_src0_6802ce.rst b/llvm/docs/AMDGPU/gfx12_src0_6802ce.rst
new file mode 100644
index 0000000..e17a719
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_6802ce.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src0_6802ce:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_src0_85aab6.rst b/llvm/docs/AMDGPU/gfx12_src0_85aab6.rst
new file mode 100644
index 0000000..effa6f6
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_85aab6.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src0_85aab6:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`literal<amdgpu_synid_literal>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_src0_c4593f.rst b/llvm/docs/AMDGPU/gfx12_src0_c4593f.rst
new file mode 100644
index 0000000..bbe6191
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_c4593f.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src0_c4593f:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`, :ref:`exec_hi<amdgpu_synid_exec_hi>`, :ref:`exec_lo<amdgpu_synid_exec_lo>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_src0_e016a1.rst b/llvm/docs/AMDGPU/gfx12_src0_e016a1.rst
new file mode 100644
index 0000000..c2d23d7
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_e016a1.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src0_e016a1:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_src0_fd235e.rst b/llvm/docs/AMDGPU/gfx12_src0_fd235e.rst
new file mode 100644
index 0000000..dc048af
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_fd235e.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src0_fd235e:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_5727cf.rst b/llvm/docs/AMDGPU/gfx12_src1_5727cf.rst
new file mode 100644
index 0000000..d1d0837
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_5727cf.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src1_5727cf:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`, :ref:`exec_hi<amdgpu_synid_exec_hi>`, :ref:`exec_lo<amdgpu_synid_exec_lo>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_5cae62.rst b/llvm/docs/AMDGPU/gfx12_src1_5cae62.rst
new file mode 100644
index 0000000..3ad591c
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_5cae62.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src1_5cae62:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_6802ce.rst b/llvm/docs/AMDGPU/gfx12_src1_6802ce.rst
new file mode 100644
index 0000000..84ff631
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_6802ce.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src1_6802ce:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_731030.rst b/llvm/docs/AMDGPU/gfx12_src1_731030.rst
new file mode 100644
index 0000000..8c67699
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_731030.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src1_731030:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_977794.rst b/llvm/docs/AMDGPU/gfx12_src1_977794.rst
new file mode 100644
index 0000000..7651340
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_977794.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src1_977794:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_c4593f.rst b/llvm/docs/AMDGPU/gfx12_src1_c4593f.rst
new file mode 100644
index 0000000..aba4da8
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_c4593f.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src1_c4593f:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`, :ref:`exec_hi<amdgpu_synid_exec_hi>`, :ref:`exec_lo<amdgpu_synid_exec_lo>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_e016a1.rst b/llvm/docs/AMDGPU/gfx12_src1_e016a1.rst
new file mode 100644
index 0000000..4385853
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_e016a1.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src1_e016a1:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_fd235e.rst b/llvm/docs/AMDGPU/gfx12_src1_fd235e.rst
new file mode 100644
index 0000000..5863e93
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_fd235e.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src1_fd235e:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_src2_2797bc.rst b/llvm/docs/AMDGPU/gfx12_src2_2797bc.rst
new file mode 100644
index 0000000..b393e2a
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src2_2797bc.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src2_2797bc:
+
+src2
+====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_src2_5727cf.rst b/llvm/docs/AMDGPU/gfx12_src2_5727cf.rst
new file mode 100644
index 0000000..9ffaa079
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src2_5727cf.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src2_5727cf:
+
+src2
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`, :ref:`exec_hi<amdgpu_synid_exec_hi>`, :ref:`exec_lo<amdgpu_synid_exec_lo>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_src2_5cae62.rst b/llvm/docs/AMDGPU/gfx12_src2_5cae62.rst
new file mode 100644
index 0000000..46d65cb
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src2_5cae62.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src2_5cae62:
+
+src2
+====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/llvm/docs/AMDGPU/gfx12_src2_6802ce.rst b/llvm/docs/AMDGPU/gfx12_src2_6802ce.rst
new file mode 100644
index 0000000..0ad2ede
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src2_6802ce.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src2_6802ce:
+
+src2
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_src2_7b936a.rst b/llvm/docs/AMDGPU/gfx12_src2_7b936a.rst
new file mode 100644
index 0000000..9f1ea3c
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src2_7b936a.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src2_7b936a:
+
+src2
+====
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`fconst<amdgpu_synid_fconst>`
diff --git a/llvm/docs/AMDGPU/gfx12_src2_96fbd3.rst b/llvm/docs/AMDGPU/gfx12_src2_96fbd3.rst
new file mode 100644
index 0000000..884d089
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src2_96fbd3.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src2_96fbd3:
+
+src2
+====
+
+Instruction input.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`fconst<amdgpu_synid_fconst>`
diff --git a/llvm/docs/AMDGPU/gfx12_src2_c4593f.rst b/llvm/docs/AMDGPU/gfx12_src2_c4593f.rst
new file mode 100644
index 0000000..849230b
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src2_c4593f.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src2_c4593f:
+
+src2
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`, :ref:`exec_hi<amdgpu_synid_exec_hi>`, :ref:`exec_lo<amdgpu_synid_exec_lo>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_src2_e016a1.rst b/llvm/docs/AMDGPU/gfx12_src2_e016a1.rst
new file mode 100644
index 0000000..266c4ea
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src2_e016a1.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_src2_e016a1:
+
+src2
+====
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_srcx0.rst b/llvm/docs/AMDGPU/gfx12_srcx0.rst
new file mode 100644
index 0000000..57b05a1
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_srcx0.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_srcx0:
+
+srcx0
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`, :ref:`exec_hi<amdgpu_synid_exec_hi>`, :ref:`exec_lo<amdgpu_synid_exec_lo>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_srcy0.rst b/llvm/docs/AMDGPU/gfx12_srcy0.rst
new file mode 100644
index 0000000..350b742
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_srcy0.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_srcy0:
+
+srcy0
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`, :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`, :ref:`exec_hi<amdgpu_synid_exec_hi>`, :ref:`exec_lo<amdgpu_synid_exec_lo>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_ssrc0_007f9c.rst b/llvm/docs/AMDGPU/gfx12_ssrc0_007f9c.rst
new file mode 100644
index 0000000..c3f33e4f
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_ssrc0_007f9c.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_ssrc0_007f9c:
+
+ssrc0
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_ssrc0_1a9ca5.rst b/llvm/docs/AMDGPU/gfx12_ssrc0_1a9ca5.rst
new file mode 100644
index 0000000..5aa3f2d
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_ssrc0_1a9ca5.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_ssrc0_1a9ca5:
+
+ssrc0
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`m0<amdgpu_synid_m0>`
diff --git a/llvm/docs/AMDGPU/gfx12_ssrc0_245536.rst b/llvm/docs/AMDGPU/gfx12_ssrc0_245536.rst
new file mode 100644
index 0000000..36925da
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_ssrc0_245536.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_ssrc0_245536:
+
+ssrc0
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`sendmsg_rtn<amdgpu_synid_sendmsg_rtn>`
diff --git a/llvm/docs/AMDGPU/gfx12_ssrc0_2797bc.rst b/llvm/docs/AMDGPU/gfx12_ssrc0_2797bc.rst
new file mode 100644
index 0000000..4eae705
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_ssrc0_2797bc.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_ssrc0_2797bc:
+
+ssrc0
+=====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`
diff --git a/llvm/docs/AMDGPU/gfx12_ssrc0_bbb4c6.rst b/llvm/docs/AMDGPU/gfx12_ssrc0_bbb4c6.rst
new file mode 100644
index 0000000..a29f83d
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_ssrc0_bbb4c6.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_ssrc0_bbb4c6:
+
+ssrc0
+=====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/llvm/docs/AMDGPU/gfx12_ssrc0_c4593f.rst b/llvm/docs/AMDGPU/gfx12_ssrc0_c4593f.rst
new file mode 100644
index 0000000..33ca4d6
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_ssrc0_c4593f.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_ssrc0_c4593f:
+
+ssrc0
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`, :ref:`exec_hi<amdgpu_synid_exec_hi>`, :ref:`exec_lo<amdgpu_synid_exec_lo>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_ssrc1_bbb4c6.rst b/llvm/docs/AMDGPU/gfx12_ssrc1_bbb4c6.rst
new file mode 100644
index 0000000..1f3ea34
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_ssrc1_bbb4c6.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_ssrc1_bbb4c6:
+
+ssrc1
+=====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`
diff --git a/llvm/docs/AMDGPU/gfx12_ssrc1_c4593f.rst b/llvm/docs/AMDGPU/gfx12_ssrc1_c4593f.rst
new file mode 100644
index 0000000..f81d0f2
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_ssrc1_c4593f.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_ssrc1_c4593f:
+
+ssrc1
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`m0<amdgpu_synid_m0>`, :ref:`scc<amdgpu_synid_scc>`, :ref:`fconst<amdgpu_synid_fconst>`, :ref:`literal<amdgpu_synid_literal>`, :ref:`exec_hi<amdgpu_synid_exec_hi>`, :ref:`exec_lo<amdgpu_synid_exec_lo>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_tgt.rst b/llvm/docs/AMDGPU/gfx12_tgt.rst
new file mode 100644
index 0000000..83a25aa
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_tgt.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_tgt:
+
+tgt
+===
+
+Instruction output.
+
+*Size:* 4 dwords.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_vaddr_a972b9.rst b/llvm/docs/AMDGPU/gfx12_vaddr_a972b9.rst
new file mode 100644
index 0000000..223b50d
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vaddr_a972b9.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vaddr_a972b9:
+
+vaddr
+=====
+
+*Size:* 11 dwords.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_vaddr_c12f43.rst b/llvm/docs/AMDGPU/gfx12_vaddr_c12f43.rst
new file mode 100644
index 0000000..5a93efe
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vaddr_c12f43.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vaddr_c12f43:
+
+vaddr
+=====
+
+*Size:* 12 dwords.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_vaddr_c8b8d4.rst b/llvm/docs/AMDGPU/gfx12_vaddr_c8b8d4.rst
new file mode 100644
index 0000000..1998e1d
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vaddr_c8b8d4.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vaddr_c8b8d4:
+
+vaddr
+=====
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vaddr_d82160.rst b/llvm/docs/AMDGPU/gfx12_vaddr_d82160.rst
new file mode 100644
index 0000000..92d09a2
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vaddr_d82160.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vaddr_d82160:
+
+vaddr
+=====
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vaddr_f2b449.rst b/llvm/docs/AMDGPU/gfx12_vaddr_f2b449.rst
new file mode 100644
index 0000000..10d7e0a
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vaddr_f2b449.rst
@@ -0,0 +1,15 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vaddr_f2b449:
+
+vaddr
+=====
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vcc.rst b/llvm/docs/AMDGPU/gfx12_vcc.rst
new file mode 100644
index 0000000..e8509ff
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vcc.rst
@@ -0,0 +1,16 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vcc:
+
+vcc
+===
+
+Vector condition code. This operand depends on wavefront size:
+
+* Should be :ref:`vcc_lo<amdgpu_synid_vcc_lo>` if wavefront size is 32.
+* Should be :ref:`vcc<amdgpu_synid_vcc>` if wavefront size is 64.
diff --git a/llvm/docs/AMDGPU/gfx12_vdata_2eda77.rst b/llvm/docs/AMDGPU/gfx12_vdata_2eda77.rst
new file mode 100644
index 0000000..839ec86
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdata_2eda77.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdata_2eda77:
+
+vdata
+=====
+
+Instruction output.
+
+*Size:* 32 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdata_48e42f.rst b/llvm/docs/AMDGPU/gfx12_vdata_48e42f.rst
new file mode 100644
index 0000000..d2ab49a
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdata_48e42f.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdata_48e42f:
+
+vdata
+=====
+
+Instruction output.
+
+*Size:* 3 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdata_69a144.rst b/llvm/docs/AMDGPU/gfx12_vdata_69a144.rst
new file mode 100644
index 0000000..22ac087
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdata_69a144.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdata_69a144:
+
+vdata
+=====
+
+Instruction output.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdata_89680f.rst b/llvm/docs/AMDGPU/gfx12_vdata_89680f.rst
new file mode 100644
index 0000000..5f4f478
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdata_89680f.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdata_89680f:
+
+vdata
+=====
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdata_aac3e8.rst b/llvm/docs/AMDGPU/gfx12_vdata_aac3e8.rst
new file mode 100644
index 0000000..2e285ef
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdata_aac3e8.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdata_aac3e8:
+
+vdata
+=====
+
+Instruction output.
+
+*Size:* 10 dwords.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_vdata_bdb32f.rst b/llvm/docs/AMDGPU/gfx12_vdata_bdb32f.rst
new file mode 100644
index 0000000..109c767
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdata_bdb32f.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdata_bdb32f:
+
+vdata
+=====
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdst_006c40.rst b/llvm/docs/AMDGPU/gfx12_vdst_006c40.rst
new file mode 100644
index 0000000..dc3ac95
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdst_006c40.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdst_006c40:
+
+vdst
+====
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`vcc<amdgpu_synid_vcc>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdst_227281.rst b/llvm/docs/AMDGPU/gfx12_vdst_227281.rst
new file mode 100644
index 0000000..13fd951
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdst_227281.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdst_227281:
+
+vdst
+====
+
+Instruction output.
+
+*Size:* 4 dwords if wavefront size is 64, otherwise 8 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdst_2eda77.rst b/llvm/docs/AMDGPU/gfx12_vdst_2eda77.rst
new file mode 100644
index 0000000..9372e48
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdst_2eda77.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdst_2eda77:
+
+vdst
+====
+
+Instruction output.
+
+*Size:* 32 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdst_47d3bc.rst b/llvm/docs/AMDGPU/gfx12_vdst_47d3bc.rst
new file mode 100644
index 0000000..056fe3f
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdst_47d3bc.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdst_47d3bc:
+
+vdst
+====
+
+Instruction output.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdst_48e42f.rst b/llvm/docs/AMDGPU/gfx12_vdst_48e42f.rst
new file mode 100644
index 0000000..84ab35b
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdst_48e42f.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdst_48e42f:
+
+vdst
+====
+
+Instruction output.
+
+*Size:* 3 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdst_69a144.rst b/llvm/docs/AMDGPU/gfx12_vdst_69a144.rst
new file mode 100644
index 0000000..70873ff
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdst_69a144.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdst_69a144:
+
+vdst
+====
+
+Instruction output.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdst_7de8e7.rst b/llvm/docs/AMDGPU/gfx12_vdst_7de8e7.rst
new file mode 100644
index 0000000..7248ea9
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdst_7de8e7.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdst_7de8e7:
+
+vdst
+====
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`exec<amdgpu_synid_exec>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdst_836716.rst b/llvm/docs/AMDGPU/gfx12_vdst_836716.rst
new file mode 100644
index 0000000..1cd43ee9
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdst_836716.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdst_836716:
+
+vdst
+====
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s<amdgpu_synid_s>`, :ref:`ttmp<amdgpu_synid_ttmp>`, :ref:`null<amdgpu_synid_null>`, :ref:`vcc_hi<amdgpu_synid_vcc_hi>`, :ref:`vcc_lo<amdgpu_synid_vcc_lo>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdst_89680f.rst b/llvm/docs/AMDGPU/gfx12_vdst_89680f.rst
new file mode 100644
index 0000000..b4f055c
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdst_89680f.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdst_89680f:
+
+vdst
+====
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdst_bdb32f.rst b/llvm/docs/AMDGPU/gfx12_vdst_bdb32f.rst
new file mode 100644
index 0000000..e2a4a47
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdst_bdb32f.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdst_bdb32f:
+
+vdst
+====
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdstx.rst b/llvm/docs/AMDGPU/gfx12_vdstx.rst
new file mode 100644
index 0000000..4b95d4d
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdstx.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdstx:
+
+vdstx
+=====
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vdsty.rst b/llvm/docs/AMDGPU/gfx12_vdsty.rst
new file mode 100644
index 0000000..cf0b464
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vdsty.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vdsty:
+
+vdsty
+=====
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_version.rst b/llvm/docs/AMDGPU/gfx12_version.rst
new file mode 100644
index 0000000..4e490ca
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_version.rst
@@ -0,0 +1,7 @@
+.. _amdgpu_synid_version:
+
+version
+=======
+
+Microcode version header.
+
diff --git a/llvm/docs/AMDGPU/gfx12_vsrc0.rst b/llvm/docs/AMDGPU/gfx12_vsrc0.rst
new file mode 100644
index 0000000..fb38169
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vsrc0.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vsrc0:
+
+vsrc0
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vsrc1_6802ce.rst b/llvm/docs/AMDGPU/gfx12_vsrc1_6802ce.rst
new file mode 100644
index 0000000..4490545
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vsrc1_6802ce.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vsrc1_6802ce:
+
+vsrc1
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vsrc1_fd235e.rst b/llvm/docs/AMDGPU/gfx12_vsrc1_fd235e.rst
new file mode 100644
index 0000000..d6567c2
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vsrc1_fd235e.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vsrc1_fd235e:
+
+vsrc1
+=====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vsrc2.rst b/llvm/docs/AMDGPU/gfx12_vsrc2.rst
new file mode 100644
index 0000000..fe20832
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vsrc2.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vsrc2:
+
+vsrc2
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vsrc3.rst b/llvm/docs/AMDGPU/gfx12_vsrc3.rst
new file mode 100644
index 0000000..18df9e4
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vsrc3.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vsrc3:
+
+vsrc3
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vsrc_56f215.rst b/llvm/docs/AMDGPU/gfx12_vsrc_56f215.rst
new file mode 100644
index 0000000..166da38
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vsrc_56f215.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vsrc_56f215:
+
+vsrc
+====
+
+Instruction input.
+
+*Size:* 3 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vsrc_6802ce.rst b/llvm/docs/AMDGPU/gfx12_vsrc_6802ce.rst
new file mode 100644
index 0000000..e879c2b
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vsrc_6802ce.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vsrc_6802ce:
+
+vsrc
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vsrc_89fd7b.rst b/llvm/docs/AMDGPU/gfx12_vsrc_89fd7b.rst
new file mode 100644
index 0000000..c521e72
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vsrc_89fd7b.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vsrc_89fd7b:
+
+vsrc
+====
+
+Instruction input.
+
+*Size:* 32 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vsrc_e016a1.rst b/llvm/docs/AMDGPU/gfx12_vsrc_e016a1.rst
new file mode 100644
index 0000000..84eb2ed
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vsrc_e016a1.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vsrc_e016a1:
+
+vsrc
+====
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vsrc_fd235e.rst b/llvm/docs/AMDGPU/gfx12_vsrc_fd235e.rst
new file mode 100644
index 0000000..640a235
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vsrc_fd235e.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vsrc_fd235e:
+
+vsrc
+====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vsrcx1.rst b/llvm/docs/AMDGPU/gfx12_vsrcx1.rst
new file mode 100644
index 0000000..9dab58c
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vsrcx1.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vsrcx1:
+
+vsrcx1
+======
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_vsrcy1.rst b/llvm/docs/AMDGPU/gfx12_vsrcy1.rst
new file mode 100644
index 0000000..496b2d6
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_vsrcy1.rst
@@ -0,0 +1,17 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_gfx12_vsrcy1:
+
+vsrcy1
+======
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v<amdgpu_synid_v>`
diff --git a/llvm/docs/AMDGPU/gfx12_waitcnt.rst b/llvm/docs/AMDGPU/gfx12_waitcnt.rst
new file mode 100644
index 0000000..4541222
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_waitcnt.rst
@@ -0,0 +1,55 @@
+..
+ **************************************************
+ * *
+ * Automatically generated file, do not edit! *
+ * *
+ **************************************************
+
+.. _amdgpu_synid_waitcnt:
+
+waitcnt
+=======
+
+Counts of outstanding instructions to wait for.
+
+The bits of this operand have the following meaning:
+
+ ===== ================================================ ============
+ Bits Description Value Range
+ ===== ================================================ ============
+ 2:0 EXP_CNT: export and LDSDIR count. 0..7
+ 3:3 Unused \-
+ 9:4 LGKM_CNT: LDS, GDS, Constant and Message count. 0..63
+ 15:10 VM_CNT: vector memory operations count. 0..63
+ ===== ================================================ ============
+
+This operand may be specified as one of the following:
+
+* An :ref:`integer_number<amdgpu_synid_integer_number>` or an :ref:`absolute_expression<amdgpu_synid_absolute_expression>`. The value must be in the range 0..0xFFFF.
+* A combination of *vmcnt*, *expcnt*, *lgkmcnt* and other values described below.
+
+ ====================== ======================================================================
+ Syntax Description
+ ====================== ======================================================================
+ vmcnt(<*N*>) A VM_CNT value. *N* must not exceed the largest VM_CNT value.
+ expcnt(<*N*>) An EXP_CNT value. *N* must not exceed the largest EXP_CNT value.
+ lgkmcnt(<*N*>) An LGKM_CNT value. *N* must not exceed the largest LGKM_CNT value.
+ vmcnt_sat(<*N*>) A VM_CNT value computed as min(*N*, the largest VM_CNT value).
+ expcnt_sat(<*N*>) An EXP_CNT value computed as min(*N*, the largest EXP_CNT value).
+ lgkmcnt_sat(<*N*>) An LGKM_CNT value computed as min(*N*, the largest LGKM_CNT value).
+ ====================== ======================================================================
+
+These values may be specified in any order. Spaces, ampersands and commas may be used as optional separators.
+
+*N* is either an
+:ref:`integer number<amdgpu_synid_integer_number>` or an
+:ref:`absolute expression<amdgpu_synid_absolute_expression>`.
+
+Examples:
+
+.. parsed-literal::
+
+ s_waitcnt vmcnt(1)
+ s_waitcnt expcnt(2) lgkmcnt(3)
+ s_waitcnt vmcnt(1), expcnt(2), lgkmcnt(3)
+ s_waitcnt vmcnt(1) & lgkmcnt_sat(100) & expcnt(2)
diff --git a/llvm/docs/AMDGPUModifierSyntax.rst b/llvm/docs/AMDGPUModifierSyntax.rst
index 334bdaf..8a60663 100644
--- a/llvm/docs/AMDGPUModifierSyntax.rst
+++ b/llvm/docs/AMDGPUModifierSyntax.rst
@@ -1078,6 +1078,73 @@ Examples:
offset:0xfffff
offset:-x
+.. _amdgpu_synid_smem_offset24s:
+
+offset24s
+~~~~~~~~~
+
+Specifies a signed 24-bit offset, in bytes. The default value is 0.
+
+ ============================= ====================================================================
+ Syntax Description
+ ============================= ====================================================================
+ offset:{-0x800000..0x7FFFFF}  Specifies an offset as an
+ :ref:`integer number <amdgpu_synid_integer_number>`
+ or an :ref:`absolute expression<amdgpu_synid_absolute_expression>`.
+ ============================= ====================================================================
+
+Examples:
+
+.. parsed-literal::
+
+ offset:-1
+ offset:0xfffff
+ offset:-x
+
+.. _amdgpu_synid_th:
+
+th
+~~
+
+Specifies the temporal hint of a memory operation.
+
+ =============================== =========================================================
+ Syntax Description
+ =============================== =========================================================
+ TH_{LOAD|STORE}_RT Regular
+ TH_{LOAD|STORE}_NT Non-temporal
+ TH_{LOAD|STORE}_HT High-temporal
+ TH_{LOAD|STORE}_LU Last use. Not available in SYS scope.
+ TH_{LOAD|STORE}_WB Regular (CU, SE); High-temporal with write-back (MALL)
+ TH_{LOAD|STORE}_NT_RT Non-temporal (CU, SE); Regular (MALL)
+ TH_{LOAD|STORE}_RT_NT Regular (CU, SE); Non-temporal (MALL)
+ TH_{LOAD|STORE}_NT_HT Non-temporal (CU, SE); High-temporal (MALL)
+ TH_{LOAD|STORE}_NT_WB Non-temporal (CU, SE); High-temporal with write-back (MALL)
+ TH_{LOAD|STORE}_BYPASS Available for SYS scope only.
+ TH_ATOMIC_RT Regular
+ TH_ATOMIC_RT_RETURN Regular. For atomic instructions that return values.
+ TH_ATOMIC_NT Non-temporal
+ TH_ATOMIC_NT_RETURN Non-temporal. For atomic instructions that return values.
+ TH_ATOMIC_CASCADE_RT Cascading atomic; Regular.
+ TH_ATOMIC_CASCADE_NT Cascading atomic; Non-temporal.
+ =============================== =========================================================
+
+.. _amdgpu_synid_scope:
+
+scope
+~~~~~
+
+Specifies the scope of a memory operation.
+
+ =============================== =========================================================
+ Syntax Description
+ =============================== =========================================================
+ SCOPE_CU Coherency within a Compute Unit.
+ SCOPE_SE Coherency within a Shader Engine.
+ SCOPE_DEV Coherency within a single device.
+ SCOPE_SYS Coherency across the full system.
+ =============================== =========================================================
+
VINTRP/VINTERP/LDSDIR Modifiers
-------------------------------
@@ -1117,6 +1184,27 @@ The default value is zero. This is a safe value, but it may be suboptimal.
issuing this instruction.
================ ======================================================
+.. _amdgpu_synid_wait_va_vdst:
+
+wait_va_vdst
+~~~~~~~~~~~~
+
+Manually specify a wait on the VA_VDST counter before issuing this instruction. VA_VDST must be less
+than or equal to this value before the instruction is issued. If set to 15, no wait is performed.
+
+If unspecified, the default is zero. This is a safe value, but it may have poor performance characteristics.
+
+This modifier is a shorthand for handling the WAR hazard where VALU reads a VGPR that is written by a parameter
+load. Since there is no VA_VSRC counter, VA_VDST must be used as a proxy to detect when the
+VALU instruction has completed.
+
+Examples:
+
+.. parsed-literal::
+
+ v_mov_b32 v1, v0
+ ds_param_load v0, . . . wait_va_vdst:0
+
.. _amdgpu_synid_wait_vdst:
wait_vdst
@@ -1135,6 +1223,27 @@ The default value is zero. This is a safe value, but it may be suboptimal.
issuing this instruction.
================== ======================================================
+.. _amdgpu_synid_wait_vm_vsrc:
+
+wait_vm_vsrc
+~~~~~~~~~~~~
+
+Manually specify a wait on the VM_VSRC counter before issuing this instruction. VM_VSRC must be less
+than or equal to this value before the instruction is issued. If set to 1, no wait is performed.
+
+If unspecified, the default is zero. This is a safe value, but it may have poor performance characteristics.
+
+This modifier is a shorthand for the WAR hazard where VMEM reads a VGPR that is written by a parameter
+load.
+
+Examples:
+
+.. parsed-literal::
+
+ buffer_load_b32 v1, v0, s0, 0
+ ds_param_load v0, . . . wait_vm_vsrc:0
+
+
DPP8 Modifiers
--------------
diff --git a/llvm/docs/AMDGPUOperandSyntax.rst b/llvm/docs/AMDGPUOperandSyntax.rst
index e8a7632..722290f 100644
--- a/llvm/docs/AMDGPUOperandSyntax.rst
+++ b/llvm/docs/AMDGPUOperandSyntax.rst
@@ -479,6 +479,7 @@ High and low 32 bits of *xnack mask* may be accessed as separate registers:
.. _amdgpu_synid_vcc:
.. _amdgpu_synid_vcc_lo:
+.. _amdgpu_synid_vcc_hi:
vcc
---
@@ -523,6 +524,8 @@ including register indexing and bounds checking.
=========== ===================================================
.. _amdgpu_synid_exec:
+.. _amdgpu_synid_exec_lo:
+.. _amdgpu_synid_exec_hi:
exec
----
@@ -752,6 +755,14 @@ or an :ref:`absolute expression<amdgpu_synid_absolute_expression>`.
The value must be in the range -0x100000..0x0FFFFF.
+.. _amdgpu_synid_simm8:
+
+simm8
+-----
+
+An 8-bit :ref:`integer number<amdgpu_synid_integer_number>`
+or an :ref:`absolute expression<amdgpu_synid_absolute_expression>`.
+
.. _amdgpu_synid_off:
off
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 74b7604..a4d110f 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -22,6 +22,7 @@ User Guide for AMDGPU Backend
AMDGPU/AMDGPUAsmGFX1013
AMDGPU/AMDGPUAsmGFX1030
AMDGPU/AMDGPUAsmGFX11
+ AMDGPU/AMDGPUAsmGFX12
AMDGPUModifierSyntax
AMDGPUOperandSyntax
AMDGPUInstructionSyntax
@@ -19908,6 +19909,7 @@ in this description.
:doc:`gfx1102<AMDGPU/AMDGPUAsmGFX11>`
:doc:`gfx1103<AMDGPU/AMDGPUAsmGFX11>`
+ RDNA 4 :doc:`GFX12<AMDGPU/AMDGPUAsmGFX12>` :doc:`gfx1200<AMDGPU/AMDGPUAsmGFX12>`
============= ============================================= =======================================
For more information about instructions, their semantics and supported
diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index f7e1374..4b4b09a 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -213,6 +213,16 @@ what to add to your calendar invite.
- `ics <https://calendar.google.com/calendar/ical/c_fe5774fa2769c5085d6b87e8fac272e8940e7d0089bc0e0a58dc3ead7978504b%40group.calendar.google.com/public/basic.ics>`__
`gcal <https://calendar.google.com/calendar/embed?src=c_fe5774fa2769c5085d6b87e8fac272e8940e7d0089bc0e0a58dc3ead7978504b%40group.calendar.google.com&ctz=Asia%2FTokyo>`__
- `Minutes/docs <https://discourse.llvm.org/t/llvm-qualification-wg-sync-ups-meeting-minutes/87148>`__
+ * - MLIR C/C++ Frontend Working Group
+ - Monthly, usually 1st Monday of the month
+ - `ics <https://calendar.google.com/calendar/ical/jvceakm3kbpku3f4jrsv1lkigo%40group.calendar.google.com/public/basic.ics>`__
+ `gcal <https://calendar.google.com/calendar/embed?src=jvceakm3kbpku3f4jrsv1lkigo%40group.calendar.google.com&ctz=America%2FLos_Angeles>`__
+ - `Minutes/docs <https://docs.google.com/document/d/1-flHK3TjQUrkSO2Fdt4webZ2zCyeXxpTLMiRQbMW7hE>`__
+ * - ClangIR Upstreaming Coordination Meeting
+ - Every 2 weeks on Mondays
+ - `ics <https://calendar.google.com/calendar/ical/c_673c6cd64474c0aff173bf8fa609559f93d654e0984d9d91d71abd32d28c0486%40group.calendar.google.com/public/basic.ics>`__
+ `gcal <https://calendar.google.com/calendar/embed?src=c_673c6cd64474c0aff173bf8fa609559f93d654e0984d9d91d71abd32d28c0486%40group.calendar.google.com&ctz=America%2FLos_Angeles>`__
+ -
For event owners, our Discord bot also supports sending automated announcements
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index 3671c1c..b7c3015 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -36,6 +36,7 @@
#define LLVM_ANALYSIS_IR2VEC_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
@@ -44,6 +45,7 @@
#include "llvm/Support/JSON.h"
#include <array>
#include <map>
+#include <optional>
namespace llvm {
@@ -143,6 +145,73 @@ public:
using InstEmbeddingsMap = DenseMap<const Instruction *, Embedding>;
using BBEmbeddingsMap = DenseMap<const BasicBlock *, Embedding>;
+/// Generic storage class for section-based vocabularies.
+/// VocabStorage provides a generic foundation for storing and accessing
+/// embeddings organized into sections.
+class VocabStorage {
+private:
+ /// Section-based storage
+ std::vector<std::vector<Embedding>> Sections;
+
+ const size_t TotalSize;
+ const unsigned Dimension;
+
+public:
+ /// Default constructor creates empty storage (invalid state)
+ VocabStorage() : Sections(), TotalSize(0), Dimension(0) {}
+
+ /// Create a VocabStorage with pre-organized section data
+ VocabStorage(std::vector<std::vector<Embedding>> &&SectionData);
+
+ VocabStorage(VocabStorage &&) = default;
+ VocabStorage &operator=(VocabStorage &&) = delete;
+
+ VocabStorage(const VocabStorage &) = delete;
+ VocabStorage &operator=(const VocabStorage &) = delete;
+
+ /// Get total number of entries across all sections
+ size_t size() const { return TotalSize; }
+
+ /// Get number of sections
+ unsigned getNumSections() const {
+ return static_cast<unsigned>(Sections.size());
+ }
+
+ /// Section-based access: Storage[sectionId][localIndex]
+ const std::vector<Embedding> &operator[](unsigned SectionId) const {
+ assert(SectionId < Sections.size() && "Invalid section ID");
+ return Sections[SectionId];
+ }
+
+ /// Get vocabulary dimension
+ unsigned getDimension() const { return Dimension; }
+
+ /// Check if vocabulary is valid (has data)
+ bool isValid() const { return TotalSize > 0; }
+
+ /// Iterator support for section-based access
+ class const_iterator {
+ const VocabStorage *Storage;
+ unsigned SectionId = 0;
+ size_t LocalIndex = 0;
+
+ public:
+ const_iterator(const VocabStorage *Storage, unsigned SectionId,
+ size_t LocalIndex)
+ : Storage(Storage), SectionId(SectionId), LocalIndex(LocalIndex) {}
+
+ LLVM_ABI const Embedding &operator*() const;
+ LLVM_ABI const_iterator &operator++();
+ LLVM_ABI bool operator==(const const_iterator &Other) const;
+ LLVM_ABI bool operator!=(const const_iterator &Other) const;
+ };
+
+ const_iterator begin() const { return const_iterator(this, 0, 0); }
+ const_iterator end() const {
+ return const_iterator(this, getNumSections(), 0);
+ }
+};
+
/// Class for storing and accessing the IR2Vec vocabulary.
/// The Vocabulary class manages seed embeddings for LLVM IR entities. The
/// seed embeddings are the initial learned representations of the entities
@@ -162,15 +231,42 @@ using BBEmbeddingsMap = DenseMap<const BasicBlock *, Embedding>;
/// embeddings.
class Vocabulary {
friend class llvm::IR2VecVocabAnalysis;
- using VocabVector = std::vector<ir2vec::Embedding>;
- VocabVector Vocab;
-public:
- // Slot layout:
- // [0 .. MaxOpcodes-1] => Instruction opcodes
- // [MaxOpcodes .. MaxOpcodes+MaxCanonicalTypeIDs-1] => Canonicalized types
- // [MaxOpcodes+MaxCanonicalTypeIDs .. NumCanonicalEntries-1] => Operand kinds
+ // Vocabulary Layout:
+ // +----------------+------------------------------------------------------+
+ // | Entity Type | Index Range |
+ // +----------------+------------------------------------------------------+
+ // | Opcodes | [0 .. (MaxOpcodes-1)] |
+ // | Canonical Types| [MaxOpcodes .. (MaxOpcodes+MaxCanonicalTypeIDs-1)] |
+ // | Operands | [(MaxOpcodes+MaxCanonicalTypeIDs) .. NumCanEntries] |
+ // +----------------+------------------------------------------------------+
+ // Note: MaxOpcodes is the number of unique opcodes supported by LLVM IR.
+ // MaxCanonicalTypeIDs is the number of canonicalized type IDs.
+ // "Similar" LLVM Types are grouped/canonicalized together. E.g., all
+ // float variants (FloatTy, DoubleTy, HalfTy, etc.) map to
+ // CanonicalTypeID::FloatTy. This helps reduce the vocabulary size
+ // and improves learning. Operands include Comparison predicates
+ // (ICmp/FCmp) along with other operand types. This can be extended to
+ // include other specializations in future.
+ enum class Section : unsigned {
+ Opcodes = 0,
+ CanonicalTypes = 1,
+ Operands = 2,
+ Predicates = 3,
+ MaxSections
+ };
+
+ // Use section-based storage for better organization and efficiency
+ VocabStorage Storage;
+
+ static constexpr unsigned NumICmpPredicates =
+ static_cast<unsigned>(CmpInst::LAST_ICMP_PREDICATE) -
+ static_cast<unsigned>(CmpInst::FIRST_ICMP_PREDICATE) + 1;
+ static constexpr unsigned NumFCmpPredicates =
+ static_cast<unsigned>(CmpInst::LAST_FCMP_PREDICATE) -
+ static_cast<unsigned>(CmpInst::FIRST_FCMP_PREDICATE) + 1;
+public:
/// Canonical type IDs supported by IR2Vec Vocabulary
enum class CanonicalTypeID : unsigned {
FloatTy,
@@ -207,59 +303,114 @@ public:
static_cast<unsigned>(CanonicalTypeID::MaxCanonicalType);
static constexpr unsigned MaxOperandKinds =
static_cast<unsigned>(OperandKind::MaxOperandKind);
+ // CmpInst::Predicate has gaps. We want the vocabulary to be dense without
+ // empty slots.
+ static constexpr unsigned MaxPredicateKinds =
+ NumICmpPredicates + NumFCmpPredicates;
Vocabulary() = default;
- LLVM_ABI Vocabulary(VocabVector &&Vocab) : Vocab(std::move(Vocab)) {}
+ LLVM_ABI Vocabulary(VocabStorage &&Storage) : Storage(std::move(Storage)) {}
+
+ Vocabulary(const Vocabulary &) = delete;
+ Vocabulary &operator=(const Vocabulary &) = delete;
+
+ Vocabulary(Vocabulary &&) = default;
+ Vocabulary &operator=(Vocabulary &&Other) = delete;
+
+ LLVM_ABI bool isValid() const {
+ return Storage.size() == NumCanonicalEntries;
+ }
+
+ LLVM_ABI unsigned getDimension() const {
+ assert(isValid() && "IR2Vec Vocabulary is invalid");
+ return Storage.getDimension();
+ }
- LLVM_ABI bool isValid() const { return Vocab.size() == NumCanonicalEntries; };
- LLVM_ABI unsigned getDimension() const;
- /// Total number of entries (opcodes + canonicalized types + operand kinds)
+ /// Total number of entries (opcodes + canonicalized types + operand kinds +
+ /// predicates)
static constexpr size_t getCanonicalSize() { return NumCanonicalEntries; }
/// Function to get vocabulary key for a given Opcode
LLVM_ABI static StringRef getVocabKeyForOpcode(unsigned Opcode);
/// Function to get vocabulary key for a given TypeID
- LLVM_ABI static StringRef getVocabKeyForTypeID(Type::TypeID TypeID);
+ LLVM_ABI static StringRef getVocabKeyForTypeID(Type::TypeID TypeID) {
+ return getVocabKeyForCanonicalTypeID(getCanonicalTypeID(TypeID));
+ }
/// Function to get vocabulary key for a given OperandKind
- LLVM_ABI static StringRef getVocabKeyForOperandKind(OperandKind Kind);
+ LLVM_ABI static StringRef getVocabKeyForOperandKind(OperandKind Kind) {
+ unsigned Index = static_cast<unsigned>(Kind);
+ assert(Index < MaxOperandKinds && "Invalid OperandKind");
+ return OperandKindNames[Index];
+ }
/// Function to classify an operand into OperandKind
LLVM_ABI static OperandKind getOperandKind(const Value *Op);
- /// Functions to return the slot index or position of a given Opcode, TypeID,
- /// or OperandKind in the vocabulary.
- LLVM_ABI static unsigned getSlotIndex(unsigned Opcode);
- LLVM_ABI static unsigned getSlotIndex(Type::TypeID TypeID);
- LLVM_ABI static unsigned getSlotIndex(const Value &Op);
+ /// Function to get vocabulary key for a given predicate
+ LLVM_ABI static StringRef getVocabKeyForPredicate(CmpInst::Predicate P);
+
+ /// Functions to return the flat vocabulary index of an opcode, type, operand, or predicate
+ LLVM_ABI static unsigned getIndex(unsigned Opcode) {
+ assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
+ return Opcode - 1; // Convert to zero-based index
+ }
+
+ LLVM_ABI static unsigned getIndex(Type::TypeID TypeID) {
+ assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID");
+ return MaxOpcodes + static_cast<unsigned>(getCanonicalTypeID(TypeID));
+ }
+
+ LLVM_ABI static unsigned getIndex(const Value &Op) {
+ unsigned Index = static_cast<unsigned>(getOperandKind(&Op));
+ assert(Index < MaxOperandKinds && "Invalid OperandKind");
+ return OperandBaseOffset + Index;
+ }
+
+ LLVM_ABI static unsigned getIndex(CmpInst::Predicate P) {
+ return PredicateBaseOffset + getPredicateLocalIndex(P);
+ }
/// Accessors to get the embedding for a given entity.
- LLVM_ABI const ir2vec::Embedding &operator[](unsigned Opcode) const;
- LLVM_ABI const ir2vec::Embedding &operator[](Type::TypeID TypeId) const;
- LLVM_ABI const ir2vec::Embedding &operator[](const Value &Arg) const;
+ LLVM_ABI const ir2vec::Embedding &operator[](unsigned Opcode) const {
+ assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
+ return Storage[static_cast<unsigned>(Section::Opcodes)][Opcode - 1];
+ }
+
+ LLVM_ABI const ir2vec::Embedding &operator[](Type::TypeID TypeID) const {
+ assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID");
+ unsigned LocalIndex = static_cast<unsigned>(getCanonicalTypeID(TypeID));
+ return Storage[static_cast<unsigned>(Section::CanonicalTypes)][LocalIndex];
+ }
+
+ LLVM_ABI const ir2vec::Embedding &operator[](const Value &Arg) const {
+ unsigned LocalIndex = static_cast<unsigned>(getOperandKind(&Arg));
+ assert(LocalIndex < MaxOperandKinds && "Invalid OperandKind");
+ return Storage[static_cast<unsigned>(Section::Operands)][LocalIndex];
+ }
+
+ LLVM_ABI const ir2vec::Embedding &operator[](CmpInst::Predicate P) const {
+ unsigned LocalIndex = getPredicateLocalIndex(P);
+ return Storage[static_cast<unsigned>(Section::Predicates)][LocalIndex];
+ }
/// Const Iterator type aliases
- using const_iterator = VocabVector::const_iterator;
+ using const_iterator = VocabStorage::const_iterator;
+
const_iterator begin() const {
assert(isValid() && "IR2Vec Vocabulary is invalid");
- return Vocab.begin();
+ return Storage.begin();
}
- const_iterator cbegin() const {
- assert(isValid() && "IR2Vec Vocabulary is invalid");
- return Vocab.cbegin();
- }
+ const_iterator cbegin() const { return begin(); }
const_iterator end() const {
assert(isValid() && "IR2Vec Vocabulary is invalid");
- return Vocab.end();
+ return Storage.end();
}
- const_iterator cend() const {
- assert(isValid() && "IR2Vec Vocabulary is invalid");
- return Vocab.cend();
- }
+ const_iterator cend() const { return end(); }
/// Returns the string key for a given index position in the vocabulary.
/// This is useful for debugging or printing the vocabulary. Do not use this
@@ -267,14 +418,24 @@ public:
LLVM_ABI static StringRef getStringKey(unsigned Pos);
/// Create a dummy vocabulary for testing purposes.
- LLVM_ABI static VocabVector createDummyVocabForTest(unsigned Dim = 1);
+ LLVM_ABI static VocabStorage createDummyVocabForTest(unsigned Dim = 1);
LLVM_ABI bool invalidate(Module &M, const PreservedAnalyses &PA,
ModuleAnalysisManager::Invalidator &Inv) const;
private:
constexpr static unsigned NumCanonicalEntries =
- MaxOpcodes + MaxCanonicalTypeIDs + MaxOperandKinds;
+ MaxOpcodes + MaxCanonicalTypeIDs + MaxOperandKinds + MaxPredicateKinds;
+
+ // Base offsets for flat index computation
+ constexpr static unsigned OperandBaseOffset =
+ MaxOpcodes + MaxCanonicalTypeIDs;
+ constexpr static unsigned PredicateBaseOffset =
+ OperandBaseOffset + MaxOperandKinds;
+
+ /// Functions for predicate index calculations
+ static unsigned getPredicateLocalIndex(CmpInst::Predicate P);
+ static CmpInst::Predicate getPredicateFromLocalIndex(unsigned LocalIndex);
/// String mappings for CanonicalTypeID values
static constexpr StringLiteral CanonicalTypeNames[] = {
@@ -322,10 +483,26 @@ private:
/// Function to get vocabulary key for canonical type by enum
LLVM_ABI static StringRef
- getVocabKeyForCanonicalTypeID(CanonicalTypeID CType);
+ getVocabKeyForCanonicalTypeID(CanonicalTypeID CType) {
+ unsigned Index = static_cast<unsigned>(CType);
+ assert(Index < MaxCanonicalTypeIDs && "Invalid CanonicalTypeID");
+ return CanonicalTypeNames[Index];
+ }
/// Function to convert TypeID to CanonicalTypeID
- LLVM_ABI static CanonicalTypeID getCanonicalTypeID(Type::TypeID TypeID);
+ LLVM_ABI static CanonicalTypeID getCanonicalTypeID(Type::TypeID TypeID) {
+ unsigned Index = static_cast<unsigned>(TypeID);
+ assert(Index < MaxTypeIDs && "Invalid TypeID");
+ return TypeIDMapping[Index];
+ }
+
+ /// Function to get the predicate enum value for a given index. Index is
+ /// relative to the predicates section of the vocabulary. E.g., Index 0
+ /// corresponds to the first predicate.
+ LLVM_ABI static CmpInst::Predicate getPredicate(unsigned Index) {
+ assert(Index < MaxPredicateKinds && "Invalid predicate index");
+ return getPredicateFromLocalIndex(Index);
+ }
};
/// Embedder provides the interface to generate embeddings (vector
@@ -418,22 +595,22 @@ public:
/// mapping between an entity of the IR (like opcode, type, argument, etc.) and
/// its corresponding embedding.
class IR2VecVocabAnalysis : public AnalysisInfoMixin<IR2VecVocabAnalysis> {
- using VocabVector = std::vector<ir2vec::Embedding>;
using VocabMap = std::map<std::string, ir2vec::Embedding>;
- VocabMap OpcVocab, TypeVocab, ArgVocab;
- VocabVector Vocab;
+ std::optional<ir2vec::VocabStorage> Vocab;
- Error readVocabulary();
+ Error readVocabulary(VocabMap &OpcVocab, VocabMap &TypeVocab,
+ VocabMap &ArgVocab);
Error parseVocabSection(StringRef Key, const json::Value &ParsedVocabValue,
VocabMap &TargetVocab, unsigned &Dim);
- void generateNumMappedVocab();
+ void generateVocabStorage(VocabMap &OpcVocab, VocabMap &TypeVocab,
+ VocabMap &ArgVocab);
void emitError(Error Err, LLVMContext &Ctx);
public:
LLVM_ABI static AnalysisKey Key;
IR2VecVocabAnalysis() = default;
- LLVM_ABI explicit IR2VecVocabAnalysis(const VocabVector &Vocab);
- LLVM_ABI explicit IR2VecVocabAnalysis(VocabVector &&Vocab);
+ LLVM_ABI explicit IR2VecVocabAnalysis(ir2vec::VocabStorage &&Vocab)
+ : Vocab(std::move(Vocab)) {}
using Result = ir2vec::Vocabulary;
LLVM_ABI Result run(Module &M, ModuleAnalysisManager &MAM);
};
diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h
index 08a7ddb..8944e736 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainer.h
+++ b/llvm/include/llvm/BinaryFormat/DXContainer.h
@@ -844,6 +844,7 @@ struct StaticSampler : public v1::StaticSampler {
enum class RootSignatureVersion {
V1_0 = 0x1,
V1_1 = 0x2,
+ V1_2 = 0x3,
};
} // namespace dxbc
diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h
index 87777fd..edee6a7 100644
--- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h
+++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h
@@ -56,7 +56,8 @@ struct RootDescriptor {
return;
}
- assert(Version == llvm::dxbc::RootSignatureVersion::V1_1 &&
+ assert((Version == llvm::dxbc::RootSignatureVersion::V1_1 ||
+ Version == llvm::dxbc::RootSignatureVersion::V1_2) &&
"Specified an invalid root signature version");
switch (Type) {
case dxil::ResourceClass::CBuffer:
@@ -100,7 +101,8 @@ struct DescriptorTableClause {
return;
}
- assert(Version == dxbc::RootSignatureVersion::V1_1 &&
+ assert((Version == dxbc::RootSignatureVersion::V1_1 ||
+ Version == dxbc::RootSignatureVersion::V1_2) &&
"Specified an invalid root signature version");
switch (Type) {
case dxil::ResourceClass::CBuffer:
@@ -131,6 +133,7 @@ struct StaticSampler {
float MaxLOD = std::numeric_limits<float>::max();
uint32_t Space = 0;
dxbc::ShaderVisibility Visibility = dxbc::ShaderVisibility::All;
+ dxbc::StaticSamplerFlags Flags = dxbc::StaticSamplerFlags::None;
};
/// Models RootElement : RootFlags | RootConstants | RootParam
diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h
index c203779..cf2a810 100644
--- a/llvm/include/llvm/Support/FileSystem.h
+++ b/llvm/include/llvm/Support/FileSystem.h
@@ -268,18 +268,6 @@ public:
/// Make \a path an absolute path.
///
-/// Makes \a path absolute using the \a current_directory if it is not already.
-/// An empty \a path will result in the \a current_directory.
-///
-/// /absolute/path => /absolute/path
-/// relative/../path => <current-directory>/relative/../path
-///
-/// @param path A path that is modified to be an absolute path.
-LLVM_ABI void make_absolute(const Twine &current_directory,
- SmallVectorImpl<char> &path);
-
-/// Make \a path an absolute path.
-///
/// Makes \a path absolute using the current directory if it is not already. An
/// empty \a path will result in the current directory.
///
diff --git a/llvm/include/llvm/Support/Path.h b/llvm/include/llvm/Support/Path.h
index 0cb5171..a8e0f33 100644
--- a/llvm/include/llvm/Support/Path.h
+++ b/llvm/include/llvm/Support/Path.h
@@ -566,6 +566,18 @@ LLVM_ABI bool is_absolute_gnu(const Twine &path, Style style = Style::native);
/// @result True if the path is relative, false if it is not.
LLVM_ABI bool is_relative(const Twine &path, Style style = Style::native);
+/// Make \a path an absolute path.
+///
+/// Makes \a path absolute using the \a current_directory if it is not already.
+/// An empty \a path will result in the \a current_directory.
+///
+/// /absolute/path => /absolute/path
+/// relative/../path => <current-directory>/relative/../path
+///
+/// @param path A path that is modified to be an absolute path.
+LLVM_ABI void make_absolute(const Twine &current_directory,
+ SmallVectorImpl<char> &path);
+
} // end namespace path
} // end namespace sys
} // end namespace llvm
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index bf4e490..d0fd483 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -29,10 +29,10 @@
#include <string>
#include <utility>
-LLVM_ABI extern llvm::cl::opt<bool> NoKernelInfoEndLTO;
-
namespace llvm {
+LLVM_ABI extern llvm::cl::opt<bool> NoKernelInfoEndLTO;
+
class AAManager;
using ModulePassManager = PassManager<Module>;
diff --git a/llvm/lib/Analysis/CtxProfAnalysis.cpp b/llvm/lib/Analysis/CtxProfAnalysis.cpp
index a363bce..c4abec0 100644
--- a/llvm/lib/Analysis/CtxProfAnalysis.cpp
+++ b/llvm/lib/Analysis/CtxProfAnalysis.cpp
@@ -30,6 +30,9 @@
#define DEBUG_TYPE "ctx_prof"
using namespace llvm;
+
+namespace llvm {
+
cl::opt<std::string>
UseCtxProfile("use-ctx-profile", cl::init(""), cl::Hidden,
cl::desc("Use the specified contextual profile file"));
@@ -50,7 +53,6 @@ static cl::opt<bool> ForceIsInSpecializedModule(
const char *AssignGUIDPass::GUIDMetadataName = "guid";
-namespace llvm {
class ProfileAnnotatorImpl final {
friend class ProfileAnnotator;
class BBInfo;
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 99afc06..271f004 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Module.h"
@@ -216,6 +217,8 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
ArgEmb += Vocab[*Op];
auto InstVector =
Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
+ if (const auto *IC = dyn_cast<CmpInst>(&I))
+ InstVector += Vocab[IC->getPredicate()];
InstVecMap[&I] = InstVector;
BBVector += InstVector;
}
@@ -250,6 +253,9 @@ void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const {
// embeddings
auto InstVector =
Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
+ // Add compare predicate embedding as an additional operand if applicable
+ if (const auto *IC = dyn_cast<CmpInst>(&I))
+ InstVector += Vocab[IC->getPredicate()];
InstVecMap[&I] = InstVector;
BBVector += InstVector;
}
@@ -257,41 +263,75 @@ void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const {
}
// ==----------------------------------------------------------------------===//
-// Vocabulary
+// VocabStorage
//===----------------------------------------------------------------------===//
-unsigned Vocabulary::getDimension() const {
- assert(isValid() && "IR2Vec Vocabulary is invalid");
- return Vocab[0].size();
-}
-
-unsigned Vocabulary::getSlotIndex(unsigned Opcode) {
- assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
- return Opcode - 1; // Convert to zero-based index
-}
-
-unsigned Vocabulary::getSlotIndex(Type::TypeID TypeID) {
- assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID");
- return MaxOpcodes + static_cast<unsigned>(getCanonicalTypeID(TypeID));
-}
-
-unsigned Vocabulary::getSlotIndex(const Value &Op) {
- unsigned Index = static_cast<unsigned>(getOperandKind(&Op));
- assert(Index < MaxOperandKinds && "Invalid OperandKind");
- return MaxOpcodes + MaxCanonicalTypeIDs + Index;
+VocabStorage::VocabStorage(std::vector<std::vector<Embedding>> &&SectionData)
+ : Sections(std::move(SectionData)), TotalSize([&] {
+ assert(!Sections.empty() && "Vocabulary has no sections");
+ // Compute total size across all sections
+ size_t Size = 0;
+ for (const auto &Section : Sections) {
+ assert(!Section.empty() && "Vocabulary section is empty");
+ Size += Section.size();
+ }
+ return Size;
+ }()),
+ Dimension([&] {
+ // Get dimension from the first embedding in the first section - all
+ // embeddings must have the same dimension
+ assert(!Sections.empty() && "Vocabulary has no sections");
+ assert(!Sections[0].empty() && "First section of vocabulary is empty");
+ unsigned ExpectedDim = static_cast<unsigned>(Sections[0][0].size());
+
+ // Verify that all embeddings across all sections have the same
+ // dimension
+ auto allSameDim = [ExpectedDim](const std::vector<Embedding> &Section) {
+ return std::all_of(Section.begin(), Section.end(),
+ [ExpectedDim](const Embedding &Emb) {
+ return Emb.size() == ExpectedDim;
+ });
+ };
+ assert(std::all_of(Sections.begin(), Sections.end(), allSameDim) &&
+ "All embeddings must have the same dimension");
+
+ return ExpectedDim;
+ }()) {}
+
+const Embedding &VocabStorage::const_iterator::operator*() const {
+ assert(SectionId < Storage->Sections.size() && "Invalid section ID");
+ assert(LocalIndex < Storage->Sections[SectionId].size() &&
+ "Local index out of range");
+ return Storage->Sections[SectionId][LocalIndex];
+}
+
+VocabStorage::const_iterator &VocabStorage::const_iterator::operator++() {
+ ++LocalIndex;
+ // Check if we need to move to the next section
+ if (SectionId < Storage->getNumSections() &&
+ LocalIndex >= Storage->Sections[SectionId].size()) {
+ assert(LocalIndex == Storage->Sections[SectionId].size() &&
+ "Local index should be at the end of the current section");
+ LocalIndex = 0;
+ ++SectionId;
+ }
+ return *this;
}
-const Embedding &Vocabulary::operator[](unsigned Opcode) const {
- return Vocab[getSlotIndex(Opcode)];
+bool VocabStorage::const_iterator::operator==(
+ const const_iterator &Other) const {
+ return Storage == Other.Storage && SectionId == Other.SectionId &&
+ LocalIndex == Other.LocalIndex;
}
-const Embedding &Vocabulary::operator[](Type::TypeID TypeID) const {
- return Vocab[getSlotIndex(TypeID)];
+bool VocabStorage::const_iterator::operator!=(
+ const const_iterator &Other) const {
+ return !(*this == Other);
}
-const ir2vec::Embedding &Vocabulary::operator[](const Value &Arg) const {
- return Vocab[getSlotIndex(Arg)];
-}
+// ==----------------------------------------------------------------------===//
+// Vocabulary
+//===----------------------------------------------------------------------===//
StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) {
assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
@@ -304,29 +344,6 @@ StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) {
return "UnknownOpcode";
}
-StringRef Vocabulary::getVocabKeyForCanonicalTypeID(CanonicalTypeID CType) {
- unsigned Index = static_cast<unsigned>(CType);
- assert(Index < MaxCanonicalTypeIDs && "Invalid CanonicalTypeID");
- return CanonicalTypeNames[Index];
-}
-
-Vocabulary::CanonicalTypeID
-Vocabulary::getCanonicalTypeID(Type::TypeID TypeID) {
- unsigned Index = static_cast<unsigned>(TypeID);
- assert(Index < MaxTypeIDs && "Invalid TypeID");
- return TypeIDMapping[Index];
-}
-
-StringRef Vocabulary::getVocabKeyForTypeID(Type::TypeID TypeID) {
- return getVocabKeyForCanonicalTypeID(getCanonicalTypeID(TypeID));
-}
-
-StringRef Vocabulary::getVocabKeyForOperandKind(Vocabulary::OperandKind Kind) {
- unsigned Index = static_cast<unsigned>(Kind);
- assert(Index < MaxOperandKinds && "Invalid OperandKind");
- return OperandKindNames[Index];
-}
-
// Helper function to classify an operand into OperandKind
Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) {
if (isa<Function>(Op))
@@ -338,18 +355,50 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) {
return OperandKind::VariableID;
}
+unsigned Vocabulary::getPredicateLocalIndex(CmpInst::Predicate P) {
+ if (P >= CmpInst::FIRST_FCMP_PREDICATE && P <= CmpInst::LAST_FCMP_PREDICATE)
+ return P - CmpInst::FIRST_FCMP_PREDICATE;
+ else
+ return P - CmpInst::FIRST_ICMP_PREDICATE +
+ (CmpInst::LAST_FCMP_PREDICATE - CmpInst::FIRST_FCMP_PREDICATE + 1);
+}
+
+CmpInst::Predicate Vocabulary::getPredicateFromLocalIndex(unsigned LocalIndex) {
+ unsigned fcmpRange =
+ CmpInst::LAST_FCMP_PREDICATE - CmpInst::FIRST_FCMP_PREDICATE + 1;
+ if (LocalIndex < fcmpRange)
+ return static_cast<CmpInst::Predicate>(CmpInst::FIRST_FCMP_PREDICATE +
+ LocalIndex);
+ else
+ return static_cast<CmpInst::Predicate>(CmpInst::FIRST_ICMP_PREDICATE +
+ LocalIndex - fcmpRange);
+}
+
+StringRef Vocabulary::getVocabKeyForPredicate(CmpInst::Predicate Pred) {
+ static SmallString<16> PredNameBuffer;
+ if (Pred < CmpInst::FIRST_ICMP_PREDICATE)
+ PredNameBuffer = "FCMP_";
+ else
+ PredNameBuffer = "ICMP_";
+ PredNameBuffer += CmpInst::getPredicateName(Pred);
+ return PredNameBuffer;
+}
+
StringRef Vocabulary::getStringKey(unsigned Pos) {
assert(Pos < NumCanonicalEntries && "Position out of bounds in vocabulary");
// Opcode
if (Pos < MaxOpcodes)
return getVocabKeyForOpcode(Pos + 1);
// Type
- if (Pos < MaxOpcodes + MaxCanonicalTypeIDs)
+ if (Pos < OperandBaseOffset)
return getVocabKeyForCanonicalTypeID(
static_cast<CanonicalTypeID>(Pos - MaxOpcodes));
// Operand
- return getVocabKeyForOperandKind(
- static_cast<OperandKind>(Pos - MaxOpcodes - MaxCanonicalTypeIDs));
+ if (Pos < PredicateBaseOffset)
+ return getVocabKeyForOperandKind(
+ static_cast<OperandKind>(Pos - OperandBaseOffset));
+ // Predicates
+ return getVocabKeyForPredicate(getPredicate(Pos - PredicateBaseOffset));
}
// For now, assume vocabulary is stable unless explicitly invalidated.
@@ -359,19 +408,51 @@ bool Vocabulary::invalidate(Module &M, const PreservedAnalyses &PA,
return !(PAC.preservedWhenStateless());
}
-Vocabulary::VocabVector Vocabulary::createDummyVocabForTest(unsigned Dim) {
- VocabVector DummyVocab;
- DummyVocab.reserve(NumCanonicalEntries);
+VocabStorage Vocabulary::createDummyVocabForTest(unsigned Dim) {
float DummyVal = 0.1f;
- // Create a dummy vocabulary with entries for all opcodes, types, and
- // operands
- for ([[maybe_unused]] unsigned _ :
- seq(0u, Vocabulary::MaxOpcodes + Vocabulary::MaxCanonicalTypeIDs +
- Vocabulary::MaxOperandKinds)) {
- DummyVocab.push_back(Embedding(Dim, DummyVal));
+
+ // Create sections for opcodes, types, operands, and predicates
+ // Order must match Vocabulary::Section enum
+ std::vector<std::vector<Embedding>> Sections;
+ Sections.reserve(4);
+
+ // Opcodes section
+ std::vector<Embedding> OpcodeSec;
+ OpcodeSec.reserve(MaxOpcodes);
+ for (unsigned I = 0; I < MaxOpcodes; ++I) {
+ OpcodeSec.emplace_back(Dim, DummyVal);
DummyVal += 0.1f;
}
- return DummyVocab;
+ Sections.push_back(std::move(OpcodeSec));
+
+ // Types section
+ std::vector<Embedding> TypeSec;
+ TypeSec.reserve(MaxCanonicalTypeIDs);
+ for (unsigned I = 0; I < MaxCanonicalTypeIDs; ++I) {
+ TypeSec.emplace_back(Dim, DummyVal);
+ DummyVal += 0.1f;
+ }
+ Sections.push_back(std::move(TypeSec));
+
+ // Operands section
+ std::vector<Embedding> OperandSec;
+ OperandSec.reserve(MaxOperandKinds);
+ for (unsigned I = 0; I < MaxOperandKinds; ++I) {
+ OperandSec.emplace_back(Dim, DummyVal);
+ DummyVal += 0.1f;
+ }
+ Sections.push_back(std::move(OperandSec));
+
+ // Predicates section
+ std::vector<Embedding> PredicateSec;
+ PredicateSec.reserve(MaxPredicateKinds);
+ for (unsigned I = 0; I < MaxPredicateKinds; ++I) {
+ PredicateSec.emplace_back(Dim, DummyVal);
+ DummyVal += 0.1f;
+ }
+ Sections.push_back(std::move(PredicateSec));
+
+ return VocabStorage(std::move(Sections));
}
// ==----------------------------------------------------------------------===//
@@ -417,7 +498,9 @@ Error IR2VecVocabAnalysis::parseVocabSection(
// FIXME: Make this optional. We can avoid file reads
// by auto-generating a default vocabulary during the build time.
-Error IR2VecVocabAnalysis::readVocabulary() {
+Error IR2VecVocabAnalysis::readVocabulary(VocabMap &OpcVocab,
+ VocabMap &TypeVocab,
+ VocabMap &ArgVocab) {
auto BufOrError = MemoryBuffer::getFileOrSTDIN(VocabFile, /*IsText=*/true);
if (!BufOrError)
return createFileError(VocabFile, BufOrError.getError());
@@ -448,7 +531,9 @@ Error IR2VecVocabAnalysis::readVocabulary() {
return Error::success();
}
-void IR2VecVocabAnalysis::generateNumMappedVocab() {
+void IR2VecVocabAnalysis::generateVocabStorage(VocabMap &OpcVocab,
+ VocabMap &TypeVocab,
+ VocabMap &ArgVocab) {
// Helper for handling missing entities in the vocabulary.
// Currently, we use a zero vector. In the future, we will throw an error to
@@ -466,7 +551,6 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() {
// Handle Opcodes
std::vector<Embedding> NumericOpcodeEmbeddings(Vocabulary::MaxOpcodes,
Embedding(Dim));
- NumericOpcodeEmbeddings.reserve(Vocabulary::MaxOpcodes);
for (unsigned Opcode : seq(0u, Vocabulary::MaxOpcodes)) {
StringRef VocabKey = Vocabulary::getVocabKeyForOpcode(Opcode + 1);
auto It = OpcVocab.find(VocabKey.str());
@@ -475,13 +559,10 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() {
else
handleMissingEntity(VocabKey.str());
}
- Vocab.insert(Vocab.end(), NumericOpcodeEmbeddings.begin(),
- NumericOpcodeEmbeddings.end());
// Handle Types - only canonical types are present in vocabulary
std::vector<Embedding> NumericTypeEmbeddings(Vocabulary::MaxCanonicalTypeIDs,
Embedding(Dim));
- NumericTypeEmbeddings.reserve(Vocabulary::MaxCanonicalTypeIDs);
for (unsigned CTypeID : seq(0u, Vocabulary::MaxCanonicalTypeIDs)) {
StringRef VocabKey = Vocabulary::getVocabKeyForCanonicalTypeID(
static_cast<Vocabulary::CanonicalTypeID>(CTypeID));
@@ -491,13 +572,10 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() {
}
handleMissingEntity(VocabKey.str());
}
- Vocab.insert(Vocab.end(), NumericTypeEmbeddings.begin(),
- NumericTypeEmbeddings.end());
// Handle Arguments/Operands
std::vector<Embedding> NumericArgEmbeddings(Vocabulary::MaxOperandKinds,
Embedding(Dim));
- NumericArgEmbeddings.reserve(Vocabulary::MaxOperandKinds);
for (unsigned OpKind : seq(0u, Vocabulary::MaxOperandKinds)) {
Vocabulary::OperandKind Kind = static_cast<Vocabulary::OperandKind>(OpKind);
StringRef VocabKey = Vocabulary::getVocabKeyForOperandKind(Kind);
@@ -508,15 +586,37 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() {
}
handleMissingEntity(VocabKey.str());
}
- Vocab.insert(Vocab.end(), NumericArgEmbeddings.begin(),
- NumericArgEmbeddings.end());
-}
-IR2VecVocabAnalysis::IR2VecVocabAnalysis(const VocabVector &Vocab)
- : Vocab(Vocab) {}
+ // Handle Predicates: part of Operands section. We look up predicate keys
+ // in ArgVocab.
+ std::vector<Embedding> NumericPredEmbeddings(Vocabulary::MaxPredicateKinds,
+ Embedding(Dim, 0));
+ for (unsigned PK : seq(0u, Vocabulary::MaxPredicateKinds)) {
+ StringRef VocabKey =
+ Vocabulary::getVocabKeyForPredicate(Vocabulary::getPredicate(PK));
+ auto It = ArgVocab.find(VocabKey.str());
+ if (It != ArgVocab.end()) {
+ NumericPredEmbeddings[PK] = It->second;
+ continue;
+ }
+ handleMissingEntity(VocabKey.str());
+ }
+
+ // Create section-based storage instead of flat vocabulary
+ // Order must match Vocabulary::Section enum
+ std::vector<std::vector<Embedding>> Sections(4);
+ Sections[static_cast<unsigned>(Vocabulary::Section::Opcodes)] =
+ std::move(NumericOpcodeEmbeddings); // Section::Opcodes
+ Sections[static_cast<unsigned>(Vocabulary::Section::CanonicalTypes)] =
+ std::move(NumericTypeEmbeddings); // Section::CanonicalTypes
+ Sections[static_cast<unsigned>(Vocabulary::Section::Operands)] =
+ std::move(NumericArgEmbeddings); // Section::Operands
+ Sections[static_cast<unsigned>(Vocabulary::Section::Predicates)] =
+ std::move(NumericPredEmbeddings); // Section::Predicates
-IR2VecVocabAnalysis::IR2VecVocabAnalysis(VocabVector &&Vocab)
- : Vocab(std::move(Vocab)) {}
+ // Create VocabStorage from organized sections
+ Vocab.emplace(std::move(Sections));
+}
void IR2VecVocabAnalysis::emitError(Error Err, LLVMContext &Ctx) {
handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
@@ -528,8 +628,8 @@ IR2VecVocabAnalysis::Result
IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
auto Ctx = &M.getContext();
// If vocabulary is already populated by the constructor, use it.
- if (!Vocab.empty())
- return Vocabulary(std::move(Vocab));
+ if (Vocab.has_value())
+ return Vocabulary(std::move(Vocab.value()));
// Otherwise, try to read from the vocabulary file.
if (VocabFile.empty()) {
@@ -538,7 +638,9 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
"set it using --ir2vec-vocab-path");
return Vocabulary(); // Return invalid result
}
- if (auto Err = readVocabulary()) {
+
+ VocabMap OpcVocab, TypeVocab, ArgVocab;
+ if (auto Err = readVocabulary(OpcVocab, TypeVocab, ArgVocab)) {
emitError(std::move(Err), *Ctx);
return Vocabulary();
}
@@ -553,9 +655,9 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
scaleVocabSection(ArgVocab, ArgWeight);
// Generate the numeric lookup vocabulary
- generateNumMappedVocab();
+ generateVocabStorage(OpcVocab, TypeVocab, ArgVocab);
- return Vocabulary(std::move(Vocab));
+ return Vocabulary(std::move(Vocab.value()));
}
// ==----------------------------------------------------------------------===//
@@ -564,7 +666,7 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
PreservedAnalyses IR2VecPrinterPass::run(Module &M,
ModuleAnalysisManager &MAM) {
- auto Vocabulary = MAM.getResult<IR2VecVocabAnalysis>(M);
+ auto &Vocabulary = MAM.getResult<IR2VecVocabAnalysis>(M);
assert(Vocabulary.isValid() && "IR2Vec Vocabulary is invalid");
for (Function &F : M) {
@@ -606,7 +708,7 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M,
PreservedAnalyses IR2VecVocabPrinterPass::run(Module &M,
ModuleAnalysisManager &MAM) {
- auto IR2VecVocabulary = MAM.getResult<IR2VecVocabAnalysis>(M);
+ auto &IR2VecVocabulary = MAM.getResult<IR2VecVocabAnalysis>(M);
assert(IR2VecVocabulary.isValid() && "IR2Vec Vocabulary is invalid");
// Print each entry
diff --git a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
index 7b93474..25e7a97 100644
--- a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
+++ b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
@@ -22,6 +22,8 @@ using namespace llvm;
#define DEBUG_TYPE "pgo-icall-prom-analysis"
+namespace llvm {
+
// The percent threshold for the direct-call target (this call site vs the
// remaining call count) for it to be considered as the promotion target.
static cl::opt<unsigned> ICPRemainingPercentThreshold(
@@ -54,6 +56,8 @@ cl::opt<unsigned> MaxNumVTableAnnotations(
"icp-max-num-vtables", cl::init(6), cl::Hidden,
cl::desc("Max number of vtables annotated for a vtable load instruction."));
+} // end namespace llvm
+
bool ICallPromotionAnalysis::isPromotionProfitable(uint64_t Count,
uint64_t TotalCount,
uint64_t RemainingCount) {
diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp
index 28b14c2..0fa804f 100644
--- a/llvm/lib/Analysis/InlineAdvisor.cpp
+++ b/llvm/lib/Analysis/InlineAdvisor.cpp
@@ -217,7 +217,7 @@ AnalysisKey PluginInlineAdvisorAnalysis::Key;
bool InlineAdvisorAnalysis::initializeIR2VecVocabIfRequested(
Module &M, ModuleAnalysisManager &MAM) {
if (!IR2VecVocabFile.empty()) {
- auto IR2VecVocabResult = MAM.getResult<IR2VecVocabAnalysis>(M);
+ auto &IR2VecVocabResult = MAM.getResult<IR2VecVocabAnalysis>(M);
if (!IR2VecVocabResult.isValid()) {
M.getContext().emitError("Failed to load IR2Vec vocabulary");
return false;
diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index b5ca6b1..11602d2 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -22,6 +22,8 @@ using namespace llvm::memprof;
#define DEBUG_TYPE "memory-profile-info"
+namespace llvm {
+
cl::opt<bool> MemProfReportHintedSizes(
"memprof-report-hinted-sizes", cl::init(false), cl::Hidden,
cl::desc("Report total allocation sizes of hinted allocations"));
@@ -52,6 +54,8 @@ cl::opt<unsigned> MinPercentMaxColdSize(
"memprof-min-percent-max-cold-size", cl::init(100), cl::Hidden,
cl::desc("Min percent of max cold bytes for critical cold context"));
+} // end namespace llvm
+
bool llvm::memprof::metadataIncludesAllContextSizeInfo() {
return MemProfReportHintedSizes || MinClonedColdBytePercent < 100;
}
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index a317ac4..a60a4bb 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -67,7 +67,6 @@ using namespace llvm::memprof;
namespace llvm {
FunctionSummary::ForceSummaryHotnessType ForceSummaryEdgesCold =
FunctionSummary::FSHT_None;
-} // namespace llvm
static cl::opt<FunctionSummary::ForceSummaryHotnessType, true> FSEC(
"force-summary-edges-cold", cl::Hidden, cl::location(ForceSummaryEdgesCold),
@@ -91,6 +90,7 @@ LLVM_ABI extern cl::opt<bool> ScalePartialSampleProfileWorkingSetSize;
extern cl::opt<unsigned> MaxNumVTableAnnotations;
extern cl::opt<bool> MemProfReportHintedSizes;
+} // namespace llvm
// Walk through the operands of a given User via worklist iteration and populate
// the set of GlobalValue references encountered. Invoked either on an
diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
index f1c3155..44d7a17 100644
--- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
@@ -24,6 +24,8 @@
#include <optional>
using namespace llvm;
+namespace llvm {
+
static cl::opt<bool> PartialProfile(
"partial-profile", cl::Hidden, cl::init(false),
cl::desc("Specify the current profile is used as a partial profile."));
@@ -44,6 +46,8 @@ static cl::opt<double> PartialSampleProfileWorkingSetSizeScaleFactor(
"and the factor to scale the working set size to use the same "
"shared thresholds as PGO."));
+} // end namespace llvm
+
// The profile summary metadata may be attached either by the frontend or by
// any backend passes (IR level instrumentation, for example). This method
// checks if the Summary is null and if so checks if the summary metadata is now
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index b4f08c3..7900dc7 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -31,11 +31,14 @@ static cl::opt<bool>
static cl::opt<std::string>
CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden,
cl::desc("File path to where .cgdata file is read"));
+
+namespace llvm {
cl::opt<bool> CodeGenDataThinLTOTwoRounds(
"codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden,
cl::desc("Enable two-round ThinLTO code generation. The first round "
"emits codegen data, while the second round uses the emitted "
"codegen data for further optimizations."));
+} // end namespace llvm
static std::string getCGDataErrString(cgdata_error Err,
const std::string &ErrMsg = "") {
diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp
index 3fd8cfe..b1cd939 100644
--- a/llvm/lib/CGData/CodeGenDataReader.cpp
+++ b/llvm/lib/CGData/CodeGenDataReader.cpp
@@ -26,14 +26,14 @@ static cl::opt<bool> IndexedCodeGenDataReadFunctionMapNames(
"disabled to save memory and time for final consumption of the "
"indexed CodeGenData in production."));
+namespace llvm {
+
cl::opt<bool> IndexedCodeGenDataLazyLoading(
"indexed-codegen-data-lazy-loading", cl::init(false), cl::Hidden,
cl::desc(
"Lazily load indexed CodeGenData. Enable to save memory and time "
"for final consumption of the indexed CodeGenData in production."));
-namespace llvm {
-
static Expected<std::unique_ptr<MemoryBuffer>>
setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) {
auto BufferOrErr = Filename.str() == "-" ? MemoryBuffer::getSTDIN()
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 701a6a2..11efe49 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -473,11 +473,9 @@ bool AsmPrinter::doInitialization(Module &M) {
AddrLabelSymbols = nullptr;
// Initialize TargetLoweringObjectFile.
- const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
- .Initialize(OutContext, TM);
+ TM.getObjFileLowering()->Initialize(OutContext, TM);
- const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
- .getModuleMetadata(M);
+ TM.getObjFileLowering()->getModuleMetadata(M);
// On AIX, we delay emitting any section information until
// after emitting the .file pseudo-op. This allows additional
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index 477e5c1..c2d474f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -34,7 +34,7 @@ cl::opt<bool> llvm::DisableGISelLegalityCheck(
cl::desc("Don't verify that MIR is fully legal between GlobalISel passes"),
cl::Hidden);
-cl::opt<bool> VerboseVerifyLegalizerInfo(
+static cl::opt<bool> VerboseVerifyLegalizerInfo(
"verbose-gisel-verify-legalizer-info",
cl::desc("Print more information to dbgs about GlobalISel legalizer rules "
"being verified"),
diff --git a/llvm/lib/CodeGen/MachineRegionInfo.cpp b/llvm/lib/CodeGen/MachineRegionInfo.cpp
index f8268b8..366755a 100644
--- a/llvm/lib/CodeGen/MachineRegionInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegionInfo.cpp
@@ -10,6 +10,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/RegionInfoImpl.h"
#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -127,7 +128,7 @@ LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const {
#endif
char MachineRegionInfoPass::ID = 0;
-char &MachineRegionInfoPassID = MachineRegionInfoPass::ID;
+char &llvm::MachineRegionInfoPassID = MachineRegionInfoPass::ID;
INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE,
"Detect single entry single exit regions", true, true)
diff --git a/llvm/lib/CodeGen/RegAllocScore.cpp b/llvm/lib/CodeGen/RegAllocScore.cpp
index 9c9cc1f..280946b 100644
--- a/llvm/lib/CodeGen/RegAllocScore.cpp
+++ b/llvm/lib/CodeGen/RegAllocScore.cpp
@@ -23,6 +23,8 @@
#include "llvm/Support/CommandLine.h"
using namespace llvm;
+
+namespace llvm {
LLVM_ABI cl::opt<double> CopyWeight("regalloc-copy-weight", cl::init(0.2),
cl::Hidden);
LLVM_ABI cl::opt<double> LoadWeight("regalloc-load-weight", cl::init(4.0),
@@ -33,6 +35,8 @@ LLVM_ABI cl::opt<double> CheapRematWeight("regalloc-cheap-remat-weight",
cl::init(0.2), cl::Hidden);
LLVM_ABI cl::opt<double> ExpensiveRematWeight("regalloc-expensive-remat-weight",
cl::init(1.0), cl::Hidden);
+} // end namespace llvm
+
#define DEBUG_TYPE "regalloc-score"
RegAllocScore &RegAllocScore::operator+=(const RegAllocScore &Other) {
diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp
index c1017d8..d973a47 100644
--- a/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp
@@ -148,7 +148,7 @@ std::error_code LVSplitContext::open(std::string ContextName,
return std::error_code();
}
-LVReader *CurrentReader = nullptr;
+static LVReader *CurrentReader = nullptr;
LVReader &LVReader::getInstance() {
if (CurrentReader)
return *CurrentReader;
diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
index 92c62b8..2b33e56 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
@@ -113,6 +113,13 @@ static raw_ostream &operator<<(raw_ostream &OS,
return OS;
}
+static raw_ostream &operator<<(raw_ostream &OS,
+ const llvm::dxbc::StaticSamplerFlags &Flags) {
+ printFlags(OS, Flags, dxbc::getStaticSamplerFlags());
+
+ return OS;
+}
+
raw_ostream &operator<<(raw_ostream &OS, const dxbc::RootFlags &Flags) {
OS << "RootFlags(";
printFlags(OS, Flags, dxbc::getRootFlags());
@@ -172,7 +179,7 @@ raw_ostream &operator<<(raw_ostream &OS, const StaticSampler &Sampler) {
<< ", borderColor = " << Sampler.BorderColor
<< ", minLOD = " << Sampler.MinLOD << ", maxLOD = " << Sampler.MaxLOD
<< ", space = " << Sampler.Space << ", visibility = " << Sampler.Visibility
- << ")";
+ << ", flags = " << Sampler.Flags << ")";
return OS;
}
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
index 5785505..7a0cf40 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
@@ -218,6 +218,7 @@ MDNode *MetadataBuilder::BuildStaticSampler(const StaticSampler &Sampler) {
ConstantAsMetadata::get(Builder.getInt32(Sampler.Space)),
ConstantAsMetadata::get(
Builder.getInt32(to_underlying(Sampler.Visibility))),
+ ConstantAsMetadata::get(Builder.getInt32(to_underlying(Sampler.Flags))),
};
return MDNode::get(Ctx, Operands);
}
@@ -417,7 +418,7 @@ Error MetadataParser::parseDescriptorTable(mcdxbc::RootSignatureDesc &RSD,
Error MetadataParser::parseStaticSampler(mcdxbc::RootSignatureDesc &RSD,
MDNode *StaticSamplerNode) {
- if (StaticSamplerNode->getNumOperands() != 14)
+ if (StaticSamplerNode->getNumOperands() != 15)
return make_error<InvalidRSMetadataFormat>("Static Sampler");
mcdxbc::StaticSampler Sampler;
@@ -501,6 +502,17 @@ Error MetadataParser::parseStaticSampler(mcdxbc::RootSignatureDesc &RSD,
return Error(std::move(E));
Sampler.ShaderVisibility = *Visibility;
+ if (RSD.Version < 3) {
+ RSD.StaticSamplers.push_back(Sampler);
+ return Error::success();
+ }
+ assert(RSD.Version >= 3);
+
+ if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 14))
+ Sampler.Flags = *Val;
+ else
+ return make_error<InvalidRSMetadataValue>("Static Sampler Flags");
+
RSD.StaticSamplers.push_back(Sampler);
return Error::success();
}
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
index 2c78d62..8a2b03d 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
@@ -40,7 +40,7 @@ bool verifyRootDescriptorFlag(uint32_t Version, uint32_t FlagsVal) {
if (Version == 1)
return Flags == FlagT::DataVolatile;
- assert(Version == 2 && "Provided invalid root signature version");
+ assert((Version <= 3) && "Provided invalid root signature version");
// The data-specific flags are mutually exclusive.
FlagT DataFlags = FlagT::DataVolatile | FlagT::DataStatic |
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index a8bb34f..33ca46c 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -30,6 +30,8 @@
#include "llvm/Support/Compiler.h"
using namespace llvm;
+namespace llvm {
+
// FIXME: Flag used for an ablation performance test, Issue #147390. Placing it
// here because referencing IR should be feasible from anywhere. Will be
// removed after the ablation test.
@@ -38,6 +40,8 @@ cl::opt<bool> ProfcheckDisableMetadataFixes(
cl::desc(
"Disable metadata propagation fixes discovered through Issue #147390"));
+} // end namespace llvm
+
InsertPosition::InsertPosition(Instruction *InsertBefore)
: InsertAt(InsertBefore ? InsertBefore->getIterator()
: InstListType::iterator()) {}
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index e5e062d..a347609 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -36,7 +36,7 @@
using namespace llvm;
-cl::opt<bool> UseDerefAtPointSemantics(
+static cl::opt<bool> UseDerefAtPointSemantics(
"use-dereferenceable-at-point-semantics", cl::Hidden, cl::init(false),
cl::desc("Deref attributes and metadata infer facts at definition only"));
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 7b25262..e6544f3 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -75,9 +75,10 @@ static cl::opt<bool>
DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
+namespace llvm {
extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
-
extern cl::opt<bool> ForceImportAll;
+} // end namespace llvm
namespace llvm {
/// Enable global value internalization in LTO.
diff --git a/llvm/lib/Object/OffloadBundle.cpp b/llvm/lib/Object/OffloadBundle.cpp
index 0dd378e..329dcbf 100644
--- a/llvm/lib/Object/OffloadBundle.cpp
+++ b/llvm/lib/Object/OffloadBundle.cpp
@@ -120,14 +120,15 @@ OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset,
if (identify_magic(Buf.getBuffer()) != file_magic::offload_bundle)
return errorCodeToError(object_error::parse_failed);
- OffloadBundleFatBin *TheBundle = new OffloadBundleFatBin(Buf, FileName);
+ std::unique_ptr<OffloadBundleFatBin> TheBundle(
+ new OffloadBundleFatBin(Buf, FileName));
// Read the Bundle Entries
Error Err = TheBundle->readEntries(Buf.getBuffer(), SectionOffset);
if (Err)
return Err;
- return std::unique_ptr<OffloadBundleFatBin>(TheBundle);
+ return std::move(TheBundle);
}
Error OffloadBundleFatBin::extractBundle(const ObjectFile &Source) {
diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
index 3c09ae4..5dff9ba 100644
--- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp
+++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
@@ -154,7 +154,7 @@ DXContainerYAML::RootSignatureYamlDesc::create(
if (Error E = readDescriptorRanges<dxbc::RTS0::v1::DescriptorRange>(
Header, RootSigDesc, DTV))
return std::move(E);
- } else if (Version == 2) {
+ } else if (Version == 2 || Version == 3) {
if (Error E = readDescriptorRanges<dxbc::RTS0::v2::DescriptorRange>(
Header, RootSigDesc, DTV))
return std::move(E);
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 256cf9d..373b3c3 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -150,6 +150,8 @@
using namespace llvm;
+namespace llvm {
+
static cl::opt<InliningAdvisorMode> UseInlineAdvisor(
"enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden,
cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"),
@@ -305,7 +307,6 @@ static cl::opt<std::string> InstrumentColdFuncOnlyPath(
extern cl::opt<std::string> UseCtxProfile;
extern cl::opt<bool> PGOInstrumentColdFunctionOnly;
-namespace llvm {
extern cl::opt<bool> EnableMemProfContextDisambiguation;
} // namespace llvm
diff --git a/llvm/lib/ProfileData/MemProfCommon.cpp b/llvm/lib/ProfileData/MemProfCommon.cpp
index a13a291..cfd2efd 100644
--- a/llvm/lib/ProfileData/MemProfCommon.cpp
+++ b/llvm/lib/ProfileData/MemProfCommon.cpp
@@ -20,6 +20,8 @@
using namespace llvm;
using namespace llvm::memprof;
+namespace llvm {
+
// Upper bound on lifetime access density (accesses per byte per lifetime sec)
// for marking an allocation cold.
LLVM_ABI cl::opt<float> MemProfLifetimeAccessDensityColdThreshold(
@@ -48,6 +50,8 @@ LLVM_ABI cl::opt<bool>
cl::desc("Enable use of hot hints (only supported for "
"unambigously hot allocations)"));
+} // end namespace llvm
+
AllocationType llvm::memprof::getAllocType(uint64_t TotalLifetimeAccessDensity,
uint64_t AllocCount,
uint64_t TotalLifetime) {
diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp
index 761d29e..3e06666 100644
--- a/llvm/lib/Support/Path.cpp
+++ b/llvm/lib/Support/Path.cpp
@@ -700,6 +700,55 @@ bool is_relative(const Twine &path, Style style) {
return !is_absolute(path, style);
}
+void make_absolute(const Twine &current_directory,
+ SmallVectorImpl<char> &path) {
+ StringRef p(path.data(), path.size());
+
+ bool rootDirectory = has_root_directory(p);
+ bool rootName = has_root_name(p);
+
+ // Already absolute.
+ if ((rootName || is_style_posix(Style::native)) && rootDirectory)
+ return;
+
+ // All the following conditions will need the current directory.
+ SmallString<128> current_dir;
+ current_directory.toVector(current_dir);
+
+ // Relative path. Prepend the current directory.
+ if (!rootName && !rootDirectory) {
+ // Append path to the current directory.
+ append(current_dir, p);
+ // Set path to the result.
+ path.swap(current_dir);
+ return;
+ }
+
+ if (!rootName && rootDirectory) {
+ StringRef cdrn = root_name(current_dir);
+ SmallString<128> curDirRootName(cdrn.begin(), cdrn.end());
+ append(curDirRootName, p);
+ // Set path to the result.
+ path.swap(curDirRootName);
+ return;
+ }
+
+ if (rootName && !rootDirectory) {
+ StringRef pRootName = root_name(p);
+ StringRef bRootDirectory = root_directory(current_dir);
+ StringRef bRelativePath = relative_path(current_dir);
+ StringRef pRelativePath = relative_path(p);
+
+ SmallString<128> res;
+ append(res, pRootName, bRootDirectory, bRelativePath, pRelativePath);
+ path.swap(res);
+ return;
+ }
+
+ llvm_unreachable("All rootName and rootDirectory combinations should have "
+ "occurred above!");
+}
+
StringRef remove_leading_dotslash(StringRef Path, Style style) {
// Remove leading "./" (or ".//" or "././" etc.)
while (Path.size() > 2 && Path[0] == '.' && is_separator(Path[1], style)) {
@@ -903,55 +952,6 @@ getPotentiallyUniqueTempFileName(const Twine &Prefix, StringRef Suffix,
return createTemporaryFile(Prefix, Suffix, Dummy, ResultPath, FS_Name);
}
-void make_absolute(const Twine &current_directory,
- SmallVectorImpl<char> &path) {
- StringRef p(path.data(), path.size());
-
- bool rootDirectory = path::has_root_directory(p);
- bool rootName = path::has_root_name(p);
-
- // Already absolute.
- if ((rootName || is_style_posix(Style::native)) && rootDirectory)
- return;
-
- // All of the following conditions will need the current directory.
- SmallString<128> current_dir;
- current_directory.toVector(current_dir);
-
- // Relative path. Prepend the current directory.
- if (!rootName && !rootDirectory) {
- // Append path to the current directory.
- path::append(current_dir, p);
- // Set path to the result.
- path.swap(current_dir);
- return;
- }
-
- if (!rootName && rootDirectory) {
- StringRef cdrn = path::root_name(current_dir);
- SmallString<128> curDirRootName(cdrn.begin(), cdrn.end());
- path::append(curDirRootName, p);
- // Set path to the result.
- path.swap(curDirRootName);
- return;
- }
-
- if (rootName && !rootDirectory) {
- StringRef pRootName = path::root_name(p);
- StringRef bRootDirectory = path::root_directory(current_dir);
- StringRef bRelativePath = path::relative_path(current_dir);
- StringRef pRelativePath = path::relative_path(p);
-
- SmallString<128> res;
- path::append(res, pRootName, bRootDirectory, bRelativePath, pRelativePath);
- path.swap(res);
- return;
- }
-
- llvm_unreachable("All rootName and rootDirectory combinations should have "
- "occurred above!");
-}
-
std::error_code make_absolute(SmallVectorImpl<char> &path) {
if (path::is_absolute(path))
return {};
@@ -960,7 +960,7 @@ std::error_code make_absolute(SmallVectorImpl<char> &path) {
if (std::error_code ec = current_path(current_dir))
return ec;
- make_absolute(current_dir, path);
+ path::make_absolute(current_dir, path);
return {};
}
diff --git a/llvm/lib/Support/ScopedPrinter.cpp b/llvm/lib/Support/ScopedPrinter.cpp
index a17e397..efb6178 100644
--- a/llvm/lib/Support/ScopedPrinter.cpp
+++ b/llvm/lib/Support/ScopedPrinter.cpp
@@ -1,12 +1,17 @@
-#include "llvm/Support/ScopedPrinter.h"
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/Format.h"
-using namespace llvm::support;
+using namespace llvm;
-namespace llvm {
-
-raw_ostream &operator<<(raw_ostream &OS, const HexNumber &Value) {
+raw_ostream &llvm::operator<<(raw_ostream &OS, const HexNumber &Value) {
OS << "0x" << utohexstr(Value.Value);
return OS;
}
@@ -45,5 +50,3 @@ JSONScopedPrinter::JSONScopedPrinter(
if (this->OuterScope)
this->OuterScope->setPrinter(*this);
}
-
-} // namespace llvm
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index 44d2ee7..c754b30 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -133,7 +133,7 @@ std::error_code FileSystem::makeAbsolute(SmallVectorImpl<char> &Path) const {
if (!WorkingDir)
return WorkingDir.getError();
- llvm::sys::fs::make_absolute(WorkingDir.get(), Path);
+ sys::path::make_absolute(WorkingDir.get(), Path);
return {};
}
@@ -300,7 +300,7 @@ private:
if (!WD || !*WD)
return Path;
Path.toVector(Storage);
- sys::fs::make_absolute(WD->get().Resolved, Storage);
+ sys::path::make_absolute(WD->get().Resolved, Storage);
return Storage;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 92a587b..280fbe2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1384,6 +1384,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
+ TargetPassConfig::addCodeGenPrepare();
+
+ if (isPassEnabled(EnableLoadStoreVectorizer))
+ addPass(createLoadStoreVectorizerPass());
+
if (TM->getTargetTriple().isAMDGCN()) {
// This lowering has been placed after codegenprepare to take advantage of
// address mode matching (which is why it isn't put with the LDS lowerings).
@@ -1392,15 +1397,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
// but has been put before switch lowering and CFG flattening so that those
// passes can run on the more optimized control flow this pass creates in
// many cases.
- //
- // FIXME: This should ideally be put after the LoadStoreVectorizer.
- // However, due to some annoying facts about ResourceUsageAnalysis,
- // (especially as exercised in the resource-usage-dead-function test),
- // we need all the function passes codegenprepare all the way through
- // said resource usage analysis to run on the call graph produced
- // before codegenprepare runs (because codegenprepare will knock some
- // nodes out of the graph, which leads to function-level passes not
- // being run on them, which causes crashes in the resource usage analysis).
addPass(createAMDGPULowerBufferFatPointersPass());
addPass(createAMDGPULowerIntrinsicsLegacyPass());
// In accordance with the above FIXME, manually force all the
@@ -1408,11 +1404,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
addPass(new DummyCGSCCPass());
}
- TargetPassConfig::addCodeGenPrepare();
-
- if (isPassEnabled(EnableLoadStoreVectorizer))
- addPass(createLoadStoreVectorizerPass());
-
// LowerSwitch pass may introduce unreachable blocks that can
// cause unexpected behavior for subsequent passes. Placing it
// here seems better that these blocks would get cleaned up by
@@ -2125,6 +2116,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
if (EnableLowerKernelArguments)
addPass(AMDGPULowerKernelArgumentsPass(TM));
+ Base::addCodeGenPrepare(addPass);
+
+ if (isPassEnabled(EnableLoadStoreVectorizer))
+ addPass(LoadStoreVectorizerPass());
+
// This lowering has been placed after codegenprepare to take advantage of
// address mode matching (which is why it isn't put with the LDS lowerings).
// It could be placed anywhere before uniformity annotations (an analysis
@@ -2132,25 +2128,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
// but has been put before switch lowering and CFG flattening so that those
// passes can run on the more optimized control flow this pass creates in
// many cases.
- //
- // FIXME: This should ideally be put after the LoadStoreVectorizer.
- // However, due to some annoying facts about ResourceUsageAnalysis,
- // (especially as exercised in the resource-usage-dead-function test),
- // we need all the function passes codegenprepare all the way through
- // said resource usage analysis to run on the call graph produced
- // before codegenprepare runs (because codegenprepare will knock some
- // nodes out of the graph, which leads to function-level passes not
- // being run on them, which causes crashes in the resource usage analysis).
addPass(AMDGPULowerBufferFatPointersPass(TM));
addPass.requireCGSCCOrder();
addPass(AMDGPULowerIntrinsicsPass(TM));
- Base::addCodeGenPrepare(addPass);
-
- if (isPassEnabled(EnableLoadStoreVectorizer))
- addPass(LoadStoreVectorizerPass());
-
// LowerSwitch pass may introduce unreachable blocks that can cause unexpected
// behavior for subsequent passes. Placing it here seems better that these
// blocks would get cleaned up by UnreachableBlockElim inserted next in the
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 77df721..54f57e0 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -314,9 +314,10 @@ let SubtargetPredicate = HasGFX950Insts, OtherPredicates = [HasBF16ConversionIns
defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>;
}
let SubtargetPredicate = isGFX1250Plus, OtherPredicates = [HasBF16ConversionInsts] in {
- defm V_CVT_F32_BF16_gfx1250 : VOP1Inst_t16_with_profiles <"v_cvt_f32_bf16_gfx1250", VOP_F32_BF16,
- VOPProfile_CVT_F32_BF16_gfx1250_t16,
- VOPProfile_CVT_F32_BF16_gfx1250_fake16>;
+ let True16Predicate = UseRealTrue16Insts in
+ defm V_CVT_F32_BF16_gfx1250_t16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_t16", VOPProfile_CVT_F32_BF16_gfx1250_t16>;
+ let True16Predicate = UseFakeTrue16Insts in
+ defm V_CVT_F32_BF16_gfx1250_fake16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_fake16", VOPProfile_CVT_F32_BF16_gfx1250_fake16>;
}
let ReadsModeReg = 0, mayRaiseFPException = 0 in {
@@ -899,6 +900,7 @@ class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p =
let DecoderNamespace = Gen.DecoderNamespace;
let OtherPredicates = !listconcat(ps.OtherPredicates,
!if(p.HasExt64BitDPP, [HasDPALU_DPP], []));
+ let True16Predicate = ps.True16Predicate;
}
class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
@@ -921,6 +923,7 @@ class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pf
VOP1_DPP8<op, ps, p> {
let AssemblerPredicate = Gen.AssemblerPredicate;
let DecoderNamespace = Gen.DecoderNamespace;
+ let True16Predicate = ps.True16Predicate;
}
//===----------------------------------------------------------------------===//
@@ -1149,7 +1152,7 @@ defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
-defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
+defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16">;
defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>;
defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index d4124ae..ee25f69 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -3139,8 +3139,8 @@ bool RISCVTTIImpl::isProfitableToSinkOperands(
bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>(
m_Value(), m_Value(), m_Value()));
if (!IsVPSplat &&
- !match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
- m_Undef(), m_ZeroMask())))
+ !match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
+ m_Value(), m_ZeroMask())))
continue;
// Don't sink i1 splats.
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index ad7e503..cf85691 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -27,7 +27,7 @@
#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
-cl::opt<bool> NoKernelInfoEndLTO(
+cl::opt<bool> llvm::NoKernelInfoEndLTO(
"no-kernel-info-end-lto",
cl::desc("remove the kernel-info pass at the end of the full LTO pipeline"),
cl::init(false), cl::Hidden);
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 83aa7de..28ee444 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -72,6 +72,7 @@ STATISTIC(NumImportedModules, "Number of modules imported from");
STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index");
STATISTIC(NumLiveSymbols, "Number of live symbols in index");
+namespace llvm {
cl::opt<bool>
ForceImportAll("force-import-all", cl::init(false), cl::Hidden,
cl::desc("Import functions with noinline attribute"));
@@ -185,9 +186,8 @@ static cl::opt<bool> CtxprofMoveRootsToOwnModule(
extern cl::list<GlobalValue::GUID> MoveSymbolGUID;
-namespace llvm {
extern cl::opt<bool> EnableMemProfContextDisambiguation;
-}
+} // end namespace llvm
// Load lazily a module from \p FileName in \p Context.
static std::unique_ptr<Module> loadFile(const std::string &FileName,
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 4f53738..150a2dc 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -28,10 +28,13 @@ using namespace llvm;
STATISTIC(NumSpecsCreated, "Number of specializations created");
+namespace llvm {
+
static cl::opt<bool> ForceSpecialization(
- "force-specialization", cl::init(false), cl::Hidden, cl::desc(
- "Force function specialization for every call site with a constant "
- "argument"));
+ "force-specialization", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Force function specialization for every call site with a constant "
+ "argument"));
static cl::opt<unsigned> MaxClones(
"funcspec-max-clones", cl::init(3), cl::Hidden, cl::desc(
@@ -91,6 +94,8 @@ static cl::opt<bool> SpecializeLiteralConstant(
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+} // end namespace llvm
+
bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB,
BasicBlock *Succ) const {
unsigned I = 0;
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 15f4d76..c4f1b68 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -214,11 +214,12 @@ static cl::opt<bool> MemProfRequireDefinitionForPromotion(
"memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
cl::desc(
"Require target function definition when promoting indirect calls"));
-} // namespace llvm
extern cl::opt<bool> MemProfReportHintedSizes;
extern cl::opt<unsigned> MinClonedColdBytePercent;
+} // namespace llvm
+
namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 99b8b88..e39e311 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -116,6 +116,8 @@ STATISTIC(
NumCSInlinedHitGrowthLimit,
"Number of functions with FDO inline stopped due to growth size limit");
+namespace llvm {
+
// Command line option to specify the file to read samples from. This is
// mainly used for debugging.
static cl::opt<std::string> SampleProfileFile(
@@ -198,7 +200,6 @@ static cl::opt<bool> DisableSampleLoaderInlining(
"pass, and merge (or scale) profiles (as configured by "
"--sample-profile-merge-inlinee)."));
-namespace llvm {
cl::opt<bool>
SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden,
cl::desc("Sort profiled recursion by edge weights."));
diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
index 093a39e..70b8614 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
@@ -23,6 +23,8 @@ using namespace sampleprof;
#define DEBUG_TYPE "sample-profile-matcher"
+namespace llvm {
+
static cl::opt<unsigned> FuncProfileSimilarityThreshold(
"func-profile-similarity-threshold", cl::Hidden, cl::init(80),
cl::desc("Consider a profile matches a function if the similarity of their "
@@ -55,6 +57,8 @@ static cl::opt<unsigned> SalvageStaleProfileMaxCallsites(
cl::desc("The maximum number of callsites in a function, above which stale "
"profile matching will be skipped."));
+} // end namespace llvm
+
void SampleProfileMatcher::findIRAnchors(const Function &F,
AnchorMap &IRAnchors) const {
// For inlined code, recover the original callsite and callee by finding the
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 09bffa7..ac41fdd 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -120,6 +120,8 @@ STATISTIC(NumVirtConstProp1Bit,
"Number of 1 bit virtual constant propagations");
STATISTIC(NumVirtConstProp, "Number of virtual constant propagations");
+namespace llvm {
+
static cl::opt<PassSummaryAction> ClSummaryAction(
"wholeprogramdevirt-summary-action",
cl::desc("What to do with the summary when running this pass"),
@@ -175,6 +177,8 @@ static cl::list<std::string>
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+} // end namespace llvm
+
/// With Clang, a pure virtual class's deleting destructor is emitted as a
/// `llvm.trap` intrinsic followed by an unreachable IR instruction. In the
/// context of whole program devirtualization, the deleting destructor of a pure
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 6ef3066..18a45c6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -319,20 +319,20 @@ Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) {
return nullptr;
}
-/// Find elements of V demanded by UserInstr.
-static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
+/// Find elements of V demanded by UserInstr. If returns false, we were not able
+/// to determine all elements.
+static bool findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr,
+ APInt &UnionUsedElts) {
unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
- // Conservatively assume that all elements are needed.
- APInt UsedElts(APInt::getAllOnes(VWidth));
-
switch (UserInstr->getOpcode()) {
case Instruction::ExtractElement: {
ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr);
assert(EEI->getVectorOperand() == V);
ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand());
if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) {
- UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue());
+ UnionUsedElts.setBit(EEIIndexC->getZExtValue());
+ return true;
}
break;
}
@@ -341,23 +341,23 @@ static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
unsigned MaskNumElts =
cast<FixedVectorType>(UserInstr->getType())->getNumElements();
- UsedElts = APInt(VWidth, 0);
- for (unsigned i = 0; i < MaskNumElts; i++) {
- unsigned MaskVal = Shuffle->getMaskValue(i);
+ for (auto I : llvm::seq(MaskNumElts)) {
+ unsigned MaskVal = Shuffle->getMaskValue(I);
if (MaskVal == -1u || MaskVal >= 2 * VWidth)
continue;
if (Shuffle->getOperand(0) == V && (MaskVal < VWidth))
- UsedElts.setBit(MaskVal);
+ UnionUsedElts.setBit(MaskVal);
if (Shuffle->getOperand(1) == V &&
((MaskVal >= VWidth) && (MaskVal < 2 * VWidth)))
- UsedElts.setBit(MaskVal - VWidth);
+ UnionUsedElts.setBit(MaskVal - VWidth);
}
- break;
+ return true;
}
default:
break;
}
- return UsedElts;
+
+ return false;
}
/// Find union of elements of V demanded by all its users.
@@ -370,7 +370,8 @@ static APInt findDemandedEltsByAllUsers(Value *V) {
APInt UnionUsedElts(VWidth, 0);
for (const Use &U : V->uses()) {
if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {
- UnionUsedElts |= findDemandedEltsBySingleUser(V, I);
+ if (!findDemandedEltsBySingleUser(V, I, UnionUsedElts))
+ return APInt::getAllOnes(VWidth);
} else {
UnionUsedElts = APInt::getAllOnes(VWidth);
break;
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 5d2d79e..917004c 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -132,9 +132,11 @@ STATISTIC(NumReassoc , "Number of reassociations");
DEBUG_COUNTER(VisitCounter, "instcombine-visit",
"Controls which instructions are visited");
-static cl::opt<bool>
-EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"),
- cl::init(true));
+namespace llvm {
+
+static cl::opt<bool> EnableCodeSinking("instcombine-code-sinking",
+ cl::desc("Enable code sinking"),
+ cl::init(true));
static cl::opt<unsigned> MaxSinkNumUsers(
"instcombine-max-sink-users", cl::init(32),
@@ -156,6 +158,8 @@ extern cl::opt<bool> ProfcheckDisableMetadataFixes;
static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare",
cl::Hidden, cl::init(true));
+} // end namespace llvm
+
std::optional<Instruction *>
InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) {
// Handle target specific intrinsics
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 0249f21..cf87e35 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -55,11 +55,11 @@ using namespace llvm;
STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions.");
STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
+namespace llvm {
extern cl::opt<unsigned> MaxNumVTableAnnotations;
-namespace llvm {
extern cl::opt<bool> EnableVTableProfileUse;
-}
+} // namespace llvm
// Command line option to disable indirect-call promotion with the default as
// false. This is for debug purpose.
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index d9e850e..120c4f6 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -222,7 +222,6 @@ cl::opt<bool> NoPGOWarnMismatchComdatWeak(
cl::desc("The option is used to turn on/off "
"warnings about hash mismatch for comdat "
"or weak functions."));
-} // namespace llvm
// Command line option to enable/disable select instruction instrumentation.
static cl::opt<bool>
@@ -347,7 +346,6 @@ cl::list<std::string> CtxPGOSkipCallsiteInstrument(
extern cl::opt<unsigned> MaxNumVTableAnnotations;
-namespace llvm {
// Command line option to turn on CFG dot dump after profile annotation.
// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts
extern cl::opt<PGOViewCountsType> PGOViewCounts;
diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index 343bec3..a5f417a 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -54,6 +54,8 @@ using namespace llvm;
STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
+namespace llvm {
+
// The minimum call count to optimize memory intrinsic calls.
static cl::opt<unsigned>
MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000),
@@ -93,6 +95,8 @@ static cl::opt<unsigned>
MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
cl::desc("Optimize the memop size <= this value"));
+} // end namespace llvm
+
namespace {
static const char *getMIName(const MemIntrinsic *MI) {
diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
index a3d4e53..0534fdd 100644
--- a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
+++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
@@ -21,7 +21,9 @@
using namespace llvm;
using CandidateInfo = ValueProfileCollector::CandidateInfo;
+namespace llvm {
extern cl::opt<bool> MemOPOptMemcmpBcmp;
+} // end namespace llvm
///--------------------------- MemIntrinsicPlugin ------------------------------
class MemIntrinsicPlugin : public InstVisitor<MemIntrinsicPlugin> {
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
index 2025fbb..36f9bb4 100644
--- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -26,6 +26,8 @@
using namespace llvm;
+namespace llvm {
+
static cl::opt<unsigned>
JumpTableSizeThreshold("jump-table-to-switch-size-threshold", cl::Hidden,
cl::desc("Only split jump tables with size less or "
@@ -43,6 +45,8 @@ static cl::opt<unsigned> FunctionSizeThreshold(
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+} // end namespace llvm
+
#define DEBUG_TYPE "jump-table-to-switch"
namespace {
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index bab1f2a..9655173 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -116,6 +116,8 @@ STATISTIC(NumIntAssociationsHoisted,
STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
"reassociated and hoisted out of the loop");
+namespace llvm {
+
/// Memory promotion is enabled by default.
static cl::opt<bool>
DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
@@ -154,7 +156,7 @@ static cl::opt<unsigned> IntAssociationUpperLimit(
// which may not be precise, since optimizeUses is capped. The result is
// correct, but we may not get as "far up" as possible to get which access is
// clobbering the one queried.
-cl::opt<unsigned> llvm::SetLicmMssaOptCap(
+cl::opt<unsigned> SetLicmMssaOptCap(
"licm-mssa-optimization-cap", cl::init(100), cl::Hidden,
cl::desc("Enable imprecision in LICM in pathological cases, in exchange "
"for faster compile. Caps the MemorySSA clobbering calls."));
@@ -162,7 +164,7 @@ cl::opt<unsigned> llvm::SetLicmMssaOptCap(
// Experimentally, memory promotion carries less importance than sinking and
// hoisting. Limit when we do promotion when using MemorySSA, in order to save
// compile time.
-cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap(
+cl::opt<unsigned> SetLicmMssaNoAccForPromotionCap(
"licm-mssa-max-acc-promotion", cl::init(250), cl::Hidden,
cl::desc("[LICM & MemorySSA] When MSSA in LICM is disabled, this has no "
"effect. When MSSA in LICM is enabled, then this is the maximum "
@@ -171,6 +173,8 @@ cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap(
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+} // end namespace llvm
+
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
static bool isNotUsedOrFoldableInLoop(const Instruction &I, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo,
diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
index 1a9e16b..d31154f 100644
--- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -17,6 +17,8 @@
using namespace llvm;
+namespace llvm {
+
/// Uses the "source_filename" instead of a Module hash ID for the suffix of
/// promoted locals during LTO. NOTE: This requires that the source filename
/// has a unique name / path to avoid name collisions.
@@ -35,6 +37,8 @@ cl::list<GlobalValue::GUID> MoveSymbolGUID(
"used with the name of contextual profiling roots."),
cl::Hidden);
+} // end namespace llvm
+
FunctionImportGlobalProcessing::FunctionImportGlobalProcessing(
Module &M, const ModuleSummaryIndex &Index,
SetVector<GlobalValue *> *GlobalsToImport, bool ClearDSOLocalOnDeclarations)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 4d1f768..8bba634 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -95,7 +95,9 @@ using namespace PatternMatch;
#define DEBUG_TYPE "simplifycfg"
-cl::opt<bool> llvm::RequireAndPreserveDomTree(
+namespace llvm {
+
+cl::opt<bool> RequireAndPreserveDomTree(
"simplifycfg-require-and-preserve-domtree", cl::Hidden,
cl::desc(
@@ -205,6 +207,8 @@ static cl::opt<unsigned> MaxJumpThreadingLiveBlocks(
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+} // end namespace llvm
+
STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
STATISTIC(NumLinearMaps,
"Number of switch instructions turned into linear mapping");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 12fb46d..e5d6c81 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5699,6 +5699,20 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
Worklist.push_back(InstOp);
}
+ auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
+ // If there are direct memory op users of the newly scalarized load,
+ // their cost may have changed because there's no scalarization
+ // overhead for the operand. Update it.
+ for (User *U : LI->users()) {
+ if (!isa<LoadInst, StoreInst>(U))
+ continue;
+ if (getWideningDecision(cast<Instruction>(U), VF) != CM_Scalarize)
+ continue;
+ setWideningDecision(
+ cast<Instruction>(U), VF, CM_Scalarize,
+ getMemInstScalarizationCost(cast<Instruction>(U), VF));
+ }
+ };
for (auto *I : AddrDefs) {
if (isa<LoadInst>(I)) {
// Setting the desired widening decision should ideally be handled in
@@ -5708,21 +5722,24 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
InstWidening Decision = getWideningDecision(I, VF);
if (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
(!isPredicatedInst(I) && !Legal->isUniformMemOp(*I, VF) &&
- Decision == CM_Scalarize))
+ Decision == CM_Scalarize)) {
// Scalarize a widened load of address or update the cost of a scalar
// load of an address.
setWideningDecision(
I, VF, CM_Scalarize,
(VF.getKnownMinValue() *
getMemoryInstructionCost(I, ElementCount::getFixed(1))));
- else if (const auto *Group = getInterleavedAccessGroup(I)) {
+ UpdateMemOpUserCost(cast<LoadInst>(I));
+ } else if (const auto *Group = getInterleavedAccessGroup(I)) {
// Scalarize an interleave group of address loads.
for (unsigned I = 0; I < Group->getFactor(); ++I) {
- if (Instruction *Member = Group->getMember(I))
+ if (Instruction *Member = Group->getMember(I)) {
setWideningDecision(
Member, VF, CM_Scalarize,
(VF.getKnownMinValue() *
getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
+ UpdateMemOpUserCost(cast<LoadInst>(Member));
+ }
}
}
} else {
@@ -9521,55 +9538,52 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
- DenseMap<Value *, Value *> ToFrozen;
- SmallVector<Instruction *> InstsToMove;
// Ensure that the start values for all header phi recipes are updated before
// vectorizing the epilogue loop.
- for (VPRecipeBase &R : Header->phis()) {
- if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
- // When vectorizing the epilogue loop, the canonical induction start
- // value needs to be changed from zero to the value after the main
- // vector loop. Find the resume value created during execution of the main
- // VPlan. It must be the first phi in the loop preheader.
- // FIXME: Improve modeling for canonical IV start values in the epilogue
- // loop.
- using namespace llvm::PatternMatch;
- PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
- for (Value *Inc : EPResumeVal->incoming_values()) {
- if (match(Inc, m_SpecificInt(0)))
- continue;
- assert(!EPI.VectorTripCount &&
- "Must only have a single non-zero incoming value");
- EPI.VectorTripCount = Inc;
- }
- // If we didn't find a non-zero vector trip count, all incoming values
- // must be zero, which also means the vector trip count is zero. Pick the
- // first zero as vector trip count.
- // TODO: We should not choose VF * UF so the main vector loop is known to
- // be dead.
- if (!EPI.VectorTripCount) {
- assert(
- EPResumeVal->getNumIncomingValues() > 0 &&
- all_of(EPResumeVal->incoming_values(),
- [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
- "all incoming values must be 0");
- EPI.VectorTripCount = EPResumeVal->getOperand(0);
- }
- VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
- assert(all_of(IV->users(),
- [](const VPUser *U) {
- return isa<VPScalarIVStepsRecipe>(U) ||
- isa<VPDerivedIVRecipe>(U) ||
- cast<VPRecipeBase>(U)->isScalarCast() ||
- cast<VPInstruction>(U)->getOpcode() ==
- Instruction::Add;
- }) &&
- "the canonical IV should only be used by its increment or "
- "ScalarIVSteps when resetting the start value");
- IV->setOperand(0, VPV);
+ VPCanonicalIVPHIRecipe *IV = Plan.getCanonicalIV();
+ // When vectorizing the epilogue loop, the canonical induction start
+ // value needs to be changed from zero to the value after the main
+ // vector loop. Find the resume value created during execution of the main
+ // VPlan. It must be the first phi in the loop preheader.
+ // FIXME: Improve modeling for canonical IV start values in the epilogue
+ // loop.
+ using namespace llvm::PatternMatch;
+ PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
+ for (Value *Inc : EPResumeVal->incoming_values()) {
+ if (match(Inc, m_SpecificInt(0)))
continue;
- }
+ assert(!EPI.VectorTripCount &&
+ "Must only have a single non-zero incoming value");
+ EPI.VectorTripCount = Inc;
+ }
+ // If we didn't find a non-zero vector trip count, all incoming values
+ // must be zero, which also means the vector trip count is zero. Pick the
+ // first zero as vector trip count.
+ // TODO: We should not choose VF * UF so the main vector loop is known to
+ // be dead.
+ if (!EPI.VectorTripCount) {
+ assert(EPResumeVal->getNumIncomingValues() > 0 &&
+ all_of(EPResumeVal->incoming_values(),
+ [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
+ "all incoming values must be 0");
+ EPI.VectorTripCount = EPResumeVal->getOperand(0);
+ }
+ VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
+ assert(all_of(IV->users(),
+ [](const VPUser *U) {
+ return isa<VPScalarIVStepsRecipe>(U) ||
+ isa<VPDerivedIVRecipe>(U) ||
+ cast<VPRecipeBase>(U)->isScalarCast() ||
+ cast<VPInstruction>(U)->getOpcode() ==
+ Instruction::Add;
+ }) &&
+ "the canonical IV should only be used by its increment or "
+ "ScalarIVSteps when resetting the start value");
+ IV->setOperand(0, VPV);
+ DenseMap<Value *, Value *> ToFrozen;
+ SmallVector<Instruction *> InstsToMove;
+ for (VPRecipeBase &R : drop_begin(Header->phis())) {
Value *ResumeV = nullptr;
// TODO: Move setting of resume values to prepareToExecute.
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f77d587..fedca65 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2241,10 +2241,9 @@ public:
/// TODO: If load combining is allowed in the IR optimizer, this analysis
/// may not be necessary.
bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
- bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
- ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
- const DataLayout &DL, ScalarEvolution &SE,
- const int64_t Diff, StridedPtrInfo &SPtrInfo) const;
+ bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
+ Align Alignment, const int64_t Diff, Value *Ptr0,
+ Value *PtrN, StridedPtrInfo &SPtrInfo) const;
/// Checks if the given array of loads can be represented as a vectorized,
/// scatter or just simple gather.
@@ -6824,13 +6823,10 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
/// 4. Any pointer operand is an instruction with the users outside of the
/// current graph (for masked gathers extra extractelement instructions
/// might be required).
-bool BoUpSLP::isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
- ArrayRef<unsigned> Order,
- const TargetTransformInfo &TTI,
- const DataLayout &DL, ScalarEvolution &SE,
- const int64_t Diff,
- StridedPtrInfo &SPtrInfo) const {
- const size_t Sz = VL.size();
+bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
+ Align Alignment, const int64_t Diff, Value *Ptr0,
+ Value *PtrN, StridedPtrInfo &SPtrInfo) const {
+ const size_t Sz = PointerOps.size();
if (Diff % (Sz - 1) != 0)
return false;
@@ -6842,7 +6838,6 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
});
const uint64_t AbsoluteDiff = std::abs(Diff);
- Type *ScalarTy = VL.front()->getType();
auto *VecTy = getWidenedType(ScalarTy, Sz);
if (IsAnyPointerUsedOutGraph ||
(AbsoluteDiff > Sz &&
@@ -6853,20 +6848,9 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
if (Diff != Stride * static_cast<int64_t>(Sz - 1))
return false;
- Align Alignment =
- cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
- ->getAlign();
- if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
+ if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
return false;
- Value *Ptr0;
- Value *PtrN;
- if (Order.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
- } else {
- Ptr0 = PointerOps[Order.front()];
- PtrN = PointerOps[Order.back()];
- }
+
// Iterate through all pointers and check if all distances are
// unique multiple of Dist.
SmallSet<int64_t, 4> Dists;
@@ -6875,14 +6859,14 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
if (Ptr == PtrN)
Dist = Diff;
else if (Ptr != Ptr0)
- Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+ Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
// If the strides are not the same or repeated, we can't
// vectorize.
if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
break;
}
if (Dists.size() == Sz) {
- Type *StrideTy = DL.getIndexType(Ptr0->getType());
+ Type *StrideTy = DL->getIndexType(Ptr0->getType());
SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
return true;
@@ -6971,7 +6955,11 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
cast<Instruction>(V), UserIgnoreList);
}))
return LoadsState::CompressVectorize;
- if (isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE, *Diff, SPtrInfo))
+ Align Alignment =
+ cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
+ ->getAlign();
+ if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN,
+ SPtrInfo))
return LoadsState::StridedVectorize;
}
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index ffd2e59..02eb637 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -978,6 +978,16 @@ void VPlan::execute(VPTransformState *State) {
// If the original loop is unreachable, delete it and all its blocks.
if (!ScalarPhVPBB->hasPredecessors()) {
+ // DeleteDeadBlocks will remove single-entry phis. Remove them from the exit
+ // VPIRBBs in VPlan as well, otherwise we would retain references to deleted
+ // IR instructions.
+ for (VPIRBasicBlock *EB : getExitBlocks()) {
+ for (VPRecipeBase &R : make_early_inc_range(EB->phis())) {
+ if (R.getNumOperands() == 1)
+ R.eraseFromParent();
+ }
+ }
+
Loop *OrigLoop =
State->LI->getLoopFor(getScalarHeader()->getIRBasicBlock());
auto Blocks = OrigLoop->getBlocksVector();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a73b083..acdb379 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -40,7 +40,7 @@
using namespace llvm;
using namespace VPlanPatternMatch;
-cl::opt<bool> EnableWideActiveLaneMask(
+static cl::opt<bool> EnableWideActiveLaneMask(
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
cl::desc("Enable use of wide get active lane mask instructions"));
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index 8399292..6f98eae 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -507,10 +507,14 @@ if(build_runtimes)
endif()
# Forward user-provived system configuration to runtimes for requirement introspection.
- # CMAKE_PREFIX_PATH is the search path for CMake packages.
+ # CMAKE_PREFIX_PATH is the search path for CMake packages. In order to pass through
+ # the command line interface, the CMake semicolon separator needs to be replaced
+ # with $<SEMICOLON>
if(CMAKE_PREFIX_PATH)
- list(APPEND extra_cmake_args "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}")
+ string(JOIN "$<SEMICOLON>" escaped_cmake_prefix_path ${CMAKE_PREFIX_PATH})
+ list(APPEND extra_cmake_args "-DCMAKE_PREFIX_PATH=${escaped_cmake_prefix_path}")
endif()
+
# CMAKE_PROGRAM_PATH is the search path for executables such as python.
if(CMAKE_PROGRAM_PATH)
list(APPEND extra_cmake_args "-DCMAKE_PROGRAM_PATH=${CMAKE_PROGRAM_PATH}")
diff --git a/llvm/test/Analysis/IR2Vec/Inputs/dummy_2D_vocab.json b/llvm/test/Analysis/IR2Vec/Inputs/dummy_2D_vocab.json
index 07fde84..ae36ff5 100644
--- a/llvm/test/Analysis/IR2Vec/Inputs/dummy_2D_vocab.json
+++ b/llvm/test/Analysis/IR2Vec/Inputs/dummy_2D_vocab.json
@@ -87,6 +87,32 @@
"Function": [1, 2],
"Pointer": [3, 4],
"Constant": [5, 6],
- "Variable": [7, 8]
+ "Variable": [7, 8],
+ "FCMP_false": [9, 10],
+ "FCMP_oeq": [11, 12],
+ "FCMP_ogt": [13, 14],
+ "FCMP_oge": [15, 16],
+ "FCMP_olt": [17, 18],
+ "FCMP_ole": [19, 20],
+ "FCMP_one": [21, 22],
+ "FCMP_ord": [23, 24],
+ "FCMP_uno": [25, 26],
+ "FCMP_ueq": [27, 28],
+ "FCMP_ugt": [29, 30],
+ "FCMP_uge": [31, 32],
+ "FCMP_ult": [33, 34],
+ "FCMP_ule": [35, 36],
+ "FCMP_une": [37, 38],
+ "FCMP_true": [39, 40],
+ "ICMP_eq": [41, 42],
+ "ICMP_ne": [43, 44],
+ "ICMP_ugt": [45, 46],
+ "ICMP_uge": [47, 48],
+ "ICMP_ult": [49, 50],
+ "ICMP_ule": [51, 52],
+ "ICMP_sgt": [53, 54],
+ "ICMP_sge": [55, 56],
+ "ICMP_slt": [57, 58],
+ "ICMP_sle": [59, 60]
}
}
diff --git a/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_arg_vocab.json b/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_arg_vocab.json
index 932b3a2..9003dc7 100644
--- a/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_arg_vocab.json
+++ b/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_arg_vocab.json
@@ -86,6 +86,32 @@
"Function": [1, 2, 3],
"Pointer": [4, 5, 6],
"Constant": [7, 8, 9],
- "Variable": [10, 11, 12]
+ "Variable": [10, 11, 12],
+ "FCMP_false": [13, 14, 15],
+ "FCMP_oeq": [16, 17, 18],
+ "FCMP_ogt": [19, 20, 21],
+ "FCMP_oge": [22, 23, 24],
+ "FCMP_olt": [25, 26, 27],
+ "FCMP_ole": [28, 29, 30],
+ "FCMP_one": [31, 32, 33],
+ "FCMP_ord": [34, 35, 36],
+ "FCMP_uno": [37, 38, 39],
+ "FCMP_ueq": [40, 41, 42],
+ "FCMP_ugt": [43, 44, 45],
+ "FCMP_uge": [46, 47, 48],
+ "FCMP_ult": [49, 50, 51],
+ "FCMP_ule": [52, 53, 54],
+ "FCMP_une": [55, 56, 57],
+ "FCMP_true": [58, 59, 60],
+ "ICMP_eq": [61, 62, 63],
+ "ICMP_ne": [64, 65, 66],
+ "ICMP_ugt": [67, 68, 69],
+ "ICMP_uge": [70, 71, 72],
+ "ICMP_ult": [73, 74, 75],
+ "ICMP_ule": [76, 77, 78],
+ "ICMP_sgt": [79, 80, 81],
+ "ICMP_sge": [82, 83, 84],
+ "ICMP_slt": [85, 86, 87],
+ "ICMP_sle": [88, 89, 90]
}
}
diff --git a/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json b/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
index 19f3efe..7ef8549 100644
--- a/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
+++ b/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json
@@ -47,6 +47,7 @@
"FPTrunc": [133, 134, 135],
"FPExt": [136, 137, 138],
"PtrToInt": [139, 140, 141],
+ "PtrToAddr": [202, 203, 204],
"IntToPtr": [142, 143, 144],
"BitCast": [145, 146, 147],
"AddrSpaceCast": [148, 149, 150],
@@ -86,6 +87,32 @@
"Function": [0, 0, 0],
"Pointer": [0, 0, 0],
"Constant": [0, 0, 0],
- "Variable": [0, 0, 0]
+ "Variable": [0, 0, 0],
+ "FCMP_false": [0, 0, 0],
+ "FCMP_oeq": [0, 0, 0],
+ "FCMP_ogt": [0, 0, 0],
+ "FCMP_oge": [0, 0, 0],
+ "FCMP_olt": [0, 0, 0],
+ "FCMP_ole": [0, 0, 0],
+ "FCMP_one": [0, 0, 0],
+ "FCMP_ord": [0, 0, 0],
+ "FCMP_uno": [0, 0, 0],
+ "FCMP_ueq": [0, 0, 0],
+ "FCMP_ugt": [0, 0, 0],
+ "FCMP_uge": [0, 0, 0],
+ "FCMP_ult": [0, 0, 0],
+ "FCMP_ule": [0, 0, 0],
+ "FCMP_une": [0, 0, 0],
+ "FCMP_true": [0, 0, 0],
+ "ICMP_eq": [0, 0, 0],
+ "ICMP_ne": [0, 0, 0],
+ "ICMP_ugt": [0, 0, 0],
+ "ICMP_uge": [0, 0, 0],
+ "ICMP_ult": [0, 0, 0],
+ "ICMP_ule": [0, 0, 0],
+ "ICMP_sgt": [1, 1, 1],
+ "ICMP_sge": [0, 0, 0],
+ "ICMP_slt": [0, 0, 0],
+ "ICMP_sle": [0, 0, 0]
}
}
diff --git a/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt b/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt
index df7769c..d62b0dd 100644
--- a/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt
+++ b/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt
@@ -82,3 +82,29 @@ Key: Function: [ 0.20 0.40 ]
Key: Pointer: [ 0.60 0.80 ]
Key: Constant: [ 1.00 1.20 ]
Key: Variable: [ 1.40 1.60 ]
+Key: FCMP_false: [ 1.80 2.00 ]
+Key: FCMP_oeq: [ 2.20 2.40 ]
+Key: FCMP_ogt: [ 2.60 2.80 ]
+Key: FCMP_oge: [ 3.00 3.20 ]
+Key: FCMP_olt: [ 3.40 3.60 ]
+Key: FCMP_ole: [ 3.80 4.00 ]
+Key: FCMP_one: [ 4.20 4.40 ]
+Key: FCMP_ord: [ 4.60 4.80 ]
+Key: FCMP_uno: [ 5.00 5.20 ]
+Key: FCMP_ueq: [ 5.40 5.60 ]
+Key: FCMP_ugt: [ 5.80 6.00 ]
+Key: FCMP_uge: [ 6.20 6.40 ]
+Key: FCMP_ult: [ 6.60 6.80 ]
+Key: FCMP_ule: [ 7.00 7.20 ]
+Key: FCMP_une: [ 7.40 7.60 ]
+Key: FCMP_true: [ 7.80 8.00 ]
+Key: ICMP_eq: [ 8.20 8.40 ]
+Key: ICMP_ne: [ 8.60 8.80 ]
+Key: ICMP_ugt: [ 9.00 9.20 ]
+Key: ICMP_uge: [ 9.40 9.60 ]
+Key: ICMP_ult: [ 9.80 10.00 ]
+Key: ICMP_ule: [ 10.20 10.40 ]
+Key: ICMP_sgt: [ 10.60 10.80 ]
+Key: ICMP_sge: [ 11.00 11.20 ]
+Key: ICMP_slt: [ 11.40 11.60 ]
+Key: ICMP_sle: [ 11.80 12.00 ]
diff --git a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt
index f3ce809..e443adb 100644
--- a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt
+++ b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt
@@ -82,3 +82,29 @@ Key: Function: [ 0.50 1.00 ]
Key: Pointer: [ 1.50 2.00 ]
Key: Constant: [ 2.50 3.00 ]
Key: Variable: [ 3.50 4.00 ]
+Key: FCMP_false: [ 4.50 5.00 ]
+Key: FCMP_oeq: [ 5.50 6.00 ]
+Key: FCMP_ogt: [ 6.50 7.00 ]
+Key: FCMP_oge: [ 7.50 8.00 ]
+Key: FCMP_olt: [ 8.50 9.00 ]
+Key: FCMP_ole: [ 9.50 10.00 ]
+Key: FCMP_one: [ 10.50 11.00 ]
+Key: FCMP_ord: [ 11.50 12.00 ]
+Key: FCMP_uno: [ 12.50 13.00 ]
+Key: FCMP_ueq: [ 13.50 14.00 ]
+Key: FCMP_ugt: [ 14.50 15.00 ]
+Key: FCMP_uge: [ 15.50 16.00 ]
+Key: FCMP_ult: [ 16.50 17.00 ]
+Key: FCMP_ule: [ 17.50 18.00 ]
+Key: FCMP_une: [ 18.50 19.00 ]
+Key: FCMP_true: [ 19.50 20.00 ]
+Key: ICMP_eq: [ 20.50 21.00 ]
+Key: ICMP_ne: [ 21.50 22.00 ]
+Key: ICMP_ugt: [ 22.50 23.00 ]
+Key: ICMP_uge: [ 23.50 24.00 ]
+Key: ICMP_ult: [ 24.50 25.00 ]
+Key: ICMP_ule: [ 25.50 26.00 ]
+Key: ICMP_sgt: [ 26.50 27.00 ]
+Key: ICMP_sge: [ 27.50 28.00 ]
+Key: ICMP_slt: [ 28.50 29.00 ]
+Key: ICMP_sle: [ 29.50 30.00 ]
diff --git a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt
index 72b25b9..7fb6043 100644
--- a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt
+++ b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt
@@ -82,3 +82,29 @@ Key: Function: [ 0.00 0.00 ]
Key: Pointer: [ 0.00 0.00 ]
Key: Constant: [ 0.00 0.00 ]
Key: Variable: [ 0.00 0.00 ]
+Key: FCMP_false: [ 0.00 0.00 ]
+Key: FCMP_oeq: [ 0.00 0.00 ]
+Key: FCMP_ogt: [ 0.00 0.00 ]
+Key: FCMP_oge: [ 0.00 0.00 ]
+Key: FCMP_olt: [ 0.00 0.00 ]
+Key: FCMP_ole: [ 0.00 0.00 ]
+Key: FCMP_one: [ 0.00 0.00 ]
+Key: FCMP_ord: [ 0.00 0.00 ]
+Key: FCMP_uno: [ 0.00 0.00 ]
+Key: FCMP_ueq: [ 0.00 0.00 ]
+Key: FCMP_ugt: [ 0.00 0.00 ]
+Key: FCMP_uge: [ 0.00 0.00 ]
+Key: FCMP_ult: [ 0.00 0.00 ]
+Key: FCMP_ule: [ 0.00 0.00 ]
+Key: FCMP_une: [ 0.00 0.00 ]
+Key: FCMP_true: [ 0.00 0.00 ]
+Key: ICMP_eq: [ 0.00 0.00 ]
+Key: ICMP_ne: [ 0.00 0.00 ]
+Key: ICMP_ugt: [ 0.00 0.00 ]
+Key: ICMP_uge: [ 0.00 0.00 ]
+Key: ICMP_ult: [ 0.00 0.00 ]
+Key: ICMP_ule: [ 0.00 0.00 ]
+Key: ICMP_sgt: [ 0.00 0.00 ]
+Key: ICMP_sge: [ 0.00 0.00 ]
+Key: ICMP_slt: [ 0.00 0.00 ]
+Key: ICMP_sle: [ 0.00 0.00 ]
diff --git a/llvm/test/Analysis/IR2Vec/if-else.ll b/llvm/test/Analysis/IR2Vec/if-else.ll
index fe53247..804c1ca 100644
--- a/llvm/test/Analysis/IR2Vec/if-else.ll
+++ b/llvm/test/Analysis/IR2Vec/if-else.ll
@@ -29,7 +29,7 @@ return: ; preds = %if.else, %if.then
; CHECK: Basic block vectors:
; CHECK-NEXT: Basic block: entry:
-; CHECK-NEXT: [ 816.00 825.00 834.00 ]
+; CHECK-NEXT: [ 816.20 825.20 834.20 ]
; CHECK-NEXT: Basic block: if.then:
; CHECK-NEXT: [ 195.00 198.00 201.00 ]
; CHECK-NEXT: Basic block: if.else:
diff --git a/llvm/test/Analysis/IR2Vec/unreachable.ll b/llvm/test/Analysis/IR2Vec/unreachable.ll
index b0e3e49..9be0ee1 100644
--- a/llvm/test/Analysis/IR2Vec/unreachable.ll
+++ b/llvm/test/Analysis/IR2Vec/unreachable.ll
@@ -33,7 +33,7 @@ return: ; preds = %if.else, %if.then
; CHECK: Basic block vectors:
; CHECK-NEXT: Basic block: entry:
-; CHECK-NEXT: [ 816.00 825.00 834.00 ]
+; CHECK-NEXT: [ 816.20 825.20 834.20 ]
; CHECK-NEXT: Basic block: if.then:
; CHECK-NEXT: [ 195.00 198.00 201.00 ]
; CHECK-NEXT: Basic block: if.else:
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 4db7663..32c7c64 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -71,7 +71,6 @@ set(LLVM_TEST_DEPENDS
${LLVM_TEST_DEPENDS_COMMON}
BugpointPasses
LLVMWindowsDriver
- UnitTests
bugpoint
llc
lli
@@ -270,10 +269,11 @@ add_lit_testsuites(LLVM ${CMAKE_CURRENT_SOURCE_DIR}
${exclude_from_check_all}
DEPENDS ${LLVM_TEST_DEPENDS}
FOLDER "Tests/Subdirectories"
- SKIP "^FileCheck" "^TableGen"
+ SKIP "^FileCheck" "^TableGen" "^Unit"
)
add_subdirectory(FileCheck)
add_subdirectory(TableGen)
+add_subdirectory(Unit)
# Setup an alias for 'check-all'.
add_custom_target(check)
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index c3b14e8..323bffe 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -57,8 +57,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
@@ -69,7 +68,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -96,9 +95,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -106,7 +104,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -123,9 +121,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -133,7 +130,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -150,9 +147,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -160,7 +156,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -245,8 +241,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
@@ -256,7 +251,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -292,16 +287,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v1, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -318,16 +312,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s20
; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_f32_e32 v1, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v2
; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -468,7 +461,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
@@ -481,7 +473,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB2_1
@@ -507,7 +498,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB2_4
@@ -556,7 +547,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -569,7 +559,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_1
; GFX908-NEXT: ; %bb.2:
@@ -594,7 +583,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
@@ -614,7 +603,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -627,7 +615,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB2_1
; GFX8-NEXT: ; %bb.2:
@@ -652,7 +639,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB2_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
@@ -672,7 +659,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -684,7 +670,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB2_1
; GFX7-NEXT: ; %bb.2:
@@ -709,7 +694,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB2_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
@@ -830,8 +815,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
@@ -842,7 +826,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -860,16 +844,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -886,9 +869,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -896,7 +878,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -913,9 +895,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -923,7 +904,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -940,9 +921,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -950,7 +930,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1035,8 +1015,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
@@ -1046,7 +1025,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -1064,15 +1043,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f32_e32 v2, v3, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -1089,16 +1066,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_add_f32_e32 v1, v2, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v2
; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -1115,16 +1091,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v1, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -1141,16 +1116,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s20
; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_f32_e32 v1, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v2
; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -1223,9 +1197,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16
; GFX11-NEXT: v_mov_b32_e32 v0, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
@@ -1237,7 +1209,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -1255,8 +1227,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
@@ -1267,7 +1238,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -1285,16 +1256,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1311,9 +1281,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1321,7 +1290,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1338,9 +1307,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1348,7 +1316,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1365,9 +1333,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1375,7 +1342,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1448,9 +1415,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16
; GFX11-NEXT: v_mov_b32_e32 v0, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
@@ -1462,7 +1427,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -1480,8 +1445,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
@@ -1492,7 +1456,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -1510,16 +1474,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1536,9 +1499,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1546,7 +1508,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1563,9 +1525,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1573,7 +1534,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1590,9 +1551,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1600,7 +1560,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1673,9 +1633,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16
; GFX11-NEXT: v_mov_b32_e32 v0, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
@@ -1687,7 +1645,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -1705,8 +1663,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
@@ -1717,7 +1674,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -1735,16 +1692,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1761,9 +1717,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1771,7 +1726,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1788,9 +1743,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1798,7 +1752,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1815,9 +1769,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1825,7 +1778,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1883,9 +1836,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
@@ -1897,7 +1848,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -1925,9 +1876,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
@@ -1939,7 +1888,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -1958,10 +1907,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: s_add_i32 s4, s20, 0x800
-; GFX10-NEXT: v_mov_b32_e32 v6, s4
-; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX10-NEXT: v_mov_b32_e32 v6, s20
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1973,7 +1921,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: v_mov_b32_e32 v2, v9
; GFX10-NEXT: v_mov_b32_e32 v3, v10
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -2001,9 +1949,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -2014,7 +1961,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2032,9 +1979,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -2045,7 +1991,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2063,9 +2009,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: v_mov_b32_e32 v5, v1
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX7-NEXT: s_add_i32 s6, s20, 0x800
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
+; GFX7-NEXT: v_mov_b32_e32 v6, s20
; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -2076,7 +2021,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX7-NEXT: v_mov_b32_e32 v1, v8
; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v3, v10
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2133,9 +2078,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
@@ -2146,7 +2089,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
@@ -2174,9 +2117,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048
; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
@@ -2187,7 +2128,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen offset:2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -2205,8 +2146,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x800
-; GFX10-NEXT: v_mov_b32_e32 v6, s4
+; GFX10-NEXT: v_mov_b32_e32 v6, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
@@ -2218,7 +2158,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v8, v3
; GFX10-NEXT: v_mov_b32_e32 v7, v2
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -2246,9 +2186,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, s20
; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -2257,7 +2196,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v9, v4
; GFX908-NEXT: v_mov_b32_e32 v8, v3
; GFX908-NEXT: v_mov_b32_e32 v7, v2
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
@@ -2275,9 +2214,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -2286,7 +2224,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v9, v4
; GFX8-NEXT: v_mov_b32_e32 v8, v3
; GFX8-NEXT: v_mov_b32_e32 v7, v2
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
@@ -2304,9 +2242,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
-; GFX7-NEXT: s_add_i32 s6, s20, 0x800
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
+; GFX7-NEXT: v_mov_b32_e32 v6, s20
; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -2315,7 +2252,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX7-NEXT: v_mov_b32_e32 v9, v4
; GFX7-NEXT: v_mov_b32_e32 v8, v3
; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
@@ -2373,10 +2310,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_readfirstlane_b32 s4, v9
; GFX12-NEXT: v_readfirstlane_b32 s5, v10
; GFX12-NEXT: v_readfirstlane_b32 s6, v7
@@ -2390,7 +2326,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
-; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2:
@@ -2420,7 +2355,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v4, s[4:7], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
@@ -2474,22 +2409,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s4, v9
; GFX11-NEXT: v_readfirstlane_b32 s5, v10
; GFX11-NEXT: v_readfirstlane_b32 s6, v7
; GFX11-NEXT: v_readfirstlane_b32 s7, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
-; GFX11-NEXT: ; implicit-def: $vgpr4
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB10_1
; GFX11-NEXT: ; %bb.2:
@@ -2518,7 +2452,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v4, s[4:7], 0 offen offset:2048 glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB10_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
@@ -2543,7 +2477,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX10-NEXT: v_mov_b32_e32 v7, v2
; GFX10-NEXT: v_mov_b32_e32 v10, v1
; GFX10-NEXT: v_mov_b32_e32 v9, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
@@ -2556,7 +2489,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX10-NEXT: ; implicit-def: $vgpr4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
@@ -2584,7 +2516,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB10_4
@@ -2640,7 +2572,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: v_mov_b32_e32 v7, v2
; GFX908-NEXT: v_mov_b32_e32 v10, v1
; GFX908-NEXT: v_mov_b32_e32 v9, v0
-; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v9
@@ -2653,7 +2584,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX908-NEXT: ; implicit-def: $vgpr4
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB10_1
; GFX908-NEXT: ; %bb.2:
@@ -2680,7 +2610,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB10_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
@@ -2704,7 +2634,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: v_mov_b32_e32 v7, v2
; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v9, v0
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v9
@@ -2717,7 +2646,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX8-NEXT: ; implicit-def: $vgpr4
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2:
@@ -2744,7 +2672,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
@@ -2768,7 +2696,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX7-NEXT: v_mov_b32_e32 v7, v2
; GFX7-NEXT: v_mov_b32_e32 v10, v1
; GFX7-NEXT: v_mov_b32_e32 v9, v0
-; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v9
@@ -2780,7 +2707,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX7-NEXT: ; implicit-def: $vgpr4
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2:
@@ -2807,7 +2733,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB10_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
@@ -2903,9 +2829,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
@@ -2917,7 +2841,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -2945,9 +2869,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
@@ -2959,7 +2881,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -2978,10 +2900,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: s_add_i32 s4, s20, 0x800
-; GFX10-NEXT: v_mov_b32_e32 v6, s4
-; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX10-NEXT: v_mov_b32_e32 v6, s20
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2993,7 +2914,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: v_mov_b32_e32 v2, v9
; GFX10-NEXT: v_mov_b32_e32 v3, v10
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -3012,9 +2933,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x800
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
+; GFX90A-NEXT: v_mov_b32_e32 v6, s20
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -3022,7 +2942,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX90A-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
@@ -3040,9 +2960,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -3053,7 +2972,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -3071,9 +2990,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -3084,7 +3002,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -3102,9 +3020,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: v_mov_b32_e32 v5, v1
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX7-NEXT: s_add_i32 s6, s20, 0x800
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
+; GFX7-NEXT: v_mov_b32_e32 v6, s20
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -3115,7 +3032,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX7-NEXT: v_mov_b32_e32 v1, v8
; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v3, v10
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -3173,9 +3090,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
@@ -3187,7 +3102,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -3215,9 +3130,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
@@ -3229,7 +3142,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -3248,10 +3161,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: s_add_i32 s4, s20, 0x800
-; GFX10-NEXT: v_mov_b32_e32 v6, s4
-; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX10-NEXT: v_mov_b32_e32 v6, s20
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3263,7 +3175,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: v_mov_b32_e32 v2, v9
; GFX10-NEXT: v_mov_b32_e32 v3, v10
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -3291,9 +3203,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -3304,7 +3215,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -3322,9 +3233,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -3335,7 +3245,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -3353,9 +3263,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: v_mov_b32_e32 v5, v1
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX7-NEXT: s_add_i32 s6, s20, 0x800
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
+; GFX7-NEXT: v_mov_b32_e32 v6, s20
; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -3366,7 +3275,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX7-NEXT: v_mov_b32_e32 v1, v8
; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v3, v10
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -7028,9 +6937,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16
; GFX11-NEXT: v_mov_b32_e32 v0, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
@@ -7042,7 +6949,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -7060,8 +6967,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
@@ -7072,7 +6978,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -7099,9 +7005,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -7109,7 +7014,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -7126,9 +7031,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -7138,7 +7042,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -7156,7 +7060,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -7164,7 +7067,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -7181,7 +7084,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
; GFX7-NEXT: v_mov_b32_e32 v8, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
@@ -7277,9 +7180,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: v_mov_b32_e32 v3, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
@@ -7290,7 +7191,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -7308,8 +7209,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
@@ -7319,7 +7219,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -7355,9 +7255,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -7366,7 +7265,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -7385,7 +7284,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -7393,7 +7291,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
@@ -7410,7 +7308,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
@@ -7543,7 +7441,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -7558,7 +7455,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-NEXT: ; %bb.2:
@@ -7587,7 +7483,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v4, s[4:7], 0 offen offset:1024 glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB21_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
@@ -7609,7 +7505,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -7622,7 +7517,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_1
@@ -7648,7 +7542,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_4
@@ -7697,7 +7591,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -7710,7 +7603,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_1
; GFX908-NEXT: ; %bb.2:
@@ -7735,7 +7627,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
@@ -7755,7 +7647,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -7768,7 +7659,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2:
@@ -7778,9 +7668,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v6, v8, v5
-; GFX8-NEXT: v_or_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_add_f16_sdwa v6, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v7, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v6
; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: s_mov_b64 s[12:13], exec
; GFX8-NEXT: v_mov_b32_e32 v7, v8
@@ -7795,7 +7685,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
@@ -7815,7 +7705,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -7826,39 +7715,38 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v9
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB21_4 Depth 2
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v7
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v10
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v11
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v5
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v10
+; GFX7-NEXT: v_add_f32_e32 v8, v8, v11
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX7-NEXT: v_mov_b32_e32 v9, v6
+; GFX7-NEXT: v_mov_b32_e32 v8, v5
; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -7870,23 +7758,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB21_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB21_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -8003,9 +7891,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16
; GFX11-NEXT: v_mov_b32_e32 v0, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
@@ -8017,7 +7903,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -8035,8 +7921,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
@@ -8047,7 +7932,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -8065,16 +7950,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -8091,9 +7975,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -8101,7 +7984,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -8118,9 +8001,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -8130,7 +8012,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -8148,7 +8030,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -8156,7 +8037,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -8173,7 +8054,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
; GFX7-NEXT: v_mov_b32_e32 v8, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
@@ -8269,9 +8150,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: v_mov_b32_e32 v3, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
@@ -8282,7 +8161,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -8300,8 +8179,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
@@ -8311,7 +8189,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -8329,15 +8207,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -8354,16 +8230,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v2
; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -8380,9 +8255,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -8391,7 +8265,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -8410,7 +8284,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -8418,7 +8291,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
@@ -8435,7 +8308,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
@@ -8530,9 +8403,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16
; GFX11-NEXT: v_mov_b32_e32 v0, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
@@ -8544,7 +8415,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -8562,8 +8433,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
@@ -8574,7 +8444,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -8592,16 +8462,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -8618,9 +8487,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -8628,7 +8496,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -8645,9 +8513,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -8657,7 +8524,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -8675,7 +8542,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -8683,7 +8549,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -8700,7 +8566,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
; GFX7-NEXT: v_mov_b32_e32 v8, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
@@ -8796,9 +8662,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: v_mov_b32_e32 v3, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
@@ -8809,7 +8673,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -8827,8 +8691,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
@@ -8838,7 +8701,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -8856,15 +8719,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -8881,16 +8742,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v2
; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -8907,9 +8767,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -8918,7 +8777,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -8937,7 +8796,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -8945,7 +8803,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
@@ -8962,7 +8820,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
@@ -9054,13 +8912,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -9082,7 +8939,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -9097,12 +8954,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
@@ -9131,7 +8987,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -9149,10 +9005,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -9183,7 +9038,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -9202,9 +9057,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
@@ -9230,7 +9084,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v6
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -9248,13 +9102,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9275,7 +9128,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -9292,13 +9145,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -9320,7 +9172,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: v_mov_b32_e32 v1, v6
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -9337,11 +9189,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -9366,7 +9217,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: v_mov_b32_e32 v1, v6
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -9382,7 +9233,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -9391,7 +9241,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
@@ -9406,7 +9256,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v1
; GFX7-NEXT: v_mov_b32_e32 v5, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -9488,13 +9338,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -9515,7 +9364,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -9531,11 +9380,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
@@ -9561,7 +9408,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -9580,11 +9427,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
@@ -9610,7 +9455,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -9629,12 +9474,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -9656,7 +9500,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -9674,13 +9518,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9700,7 +9543,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -9717,13 +9560,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -9744,7 +9586,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -9761,11 +9603,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -9789,7 +9630,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -9806,7 +9647,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -9815,7 +9655,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -9830,7 +9670,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
@@ -9930,7 +9770,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -9942,40 +9781,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB28_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX942-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX942-NEXT: s_movk_i32 s10, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX942-NEXT: s_mov_b32 s11, 0x7060302
; GFX942-NEXT: .LBB28_3: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB28_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX942-NEXT: v_add_f32_e32 v4, v4, v9
-; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX942-NEXT: v_add_f32_e32 v6, v6, v10
+; GFX942-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX942-NEXT: v_add3_u32 v7, v7, v6, s10
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_add_f32_e32 v5, v5, v10
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s10
-; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
+; GFX942-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX942-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX942-NEXT: v_add3_u32 v8, v8, v7, s10
+; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
-; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc
+; GFX942-NEXT: v_perm_b32 v8, v7, v6, s11
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -9988,27 +9826,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB28_4
; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB28_3
; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
@@ -10022,8 +9859,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
; GFX11-TRUE16-NEXT: ; %bb.2:
@@ -10036,28 +9872,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB28_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v8 :: v_dual_add_f32 v4, v4, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, v6, v8 :: v_dual_add_f32 v5, v5, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
; GFX11-TRUE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -10071,14 +9907,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_4
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v5
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -10088,13 +9924,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
@@ -10108,8 +9943,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
; GFX11-FAKE16-NEXT: ; %bb.2:
@@ -10122,28 +9956,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-FAKE16-NEXT: ; Child Loop BB28_4 Depth 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
-; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, v6, v9 :: v_dual_add_f32 v5, v5, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7
; GFX11-FAKE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -10157,14 +9991,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_4
; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v5
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -10174,13 +10008,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v5
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
@@ -10192,8 +10025,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB28_1
@@ -10205,25 +10037,25 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB28_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v4, v4, v8
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v8
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v9
+; GFX10-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX10-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
+; GFX10-NEXT: v_perm_b32 v6, v6, v5, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v5, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v7
; GFX10-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -10235,15 +10067,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB28_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -10252,13 +10084,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_cbranch_execnz .LBB28_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -10270,38 +10101,37 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX90A-NEXT: s_mov_b32 s15, 0x7060302
; GFX90A-NEXT: .LBB28_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB28_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9
-; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14
-; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v10
+; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
+; GFX90A-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX90A-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX90A-NEXT: v_add3_u32 v8, v8, v7, s14
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc
+; GFX90A-NEXT: v_perm_b32 v8, v7, v6, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -10313,27 +10143,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -10345,8 +10174,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB28_1
; GFX908-NEXT: ; %bb.2:
@@ -10360,24 +10188,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB28_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX908-NEXT: v_add_f32_e32 v4, v4, v8
-; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14
-; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v8
+; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX908-NEXT: v_add3_u32 v6, v6, v5, s14
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
-; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v9
+; GFX908-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX908-NEXT: v_add3_u32 v10, v10, v6, s14
+; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
+; GFX908-NEXT: v_perm_b32 v6, v6, v5, s15
; GFX908-NEXT: v_mov_b32_e32 v5, v6
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -10389,27 +10217,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB28_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB28_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -10421,8 +10248,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB28_1
; GFX8-NEXT: ; %bb.2:
@@ -10434,27 +10260,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB28_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT: v_add_f32_e32 v4, v4, v8
-; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
-; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v8
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v9
+; GFX8-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v6
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v6, v6, v5, 16
; GFX8-NEXT: v_mov_b32_e32 v5, v6
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -10466,27 +10292,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB28_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB28_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -10497,36 +10322,35 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB28_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v5
; GFX7-NEXT: .LBB28_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v9
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_add_f32_e32 v8, v8, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v8, v5, 16
+; GFX7-NEXT: v_mov_b32_e32 v9, v6
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v8, v5
; GFX7-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -10538,23 +10362,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB28_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB28_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v7
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v8
+; GFX7-NEXT: v_mov_b32_e32 v1, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -10658,13 +10482,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -10686,7 +10509,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -10701,12 +10524,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
@@ -10735,7 +10557,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -10753,10 +10575,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -10787,7 +10608,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -10806,9 +10627,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
@@ -10834,7 +10654,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v6
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -10852,13 +10672,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -10879,7 +10698,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -10896,13 +10715,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -10924,7 +10742,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: v_mov_b32_e32 v1, v6
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -10941,11 +10759,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -10970,7 +10787,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: v_mov_b32_e32 v1, v6
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -10986,7 +10803,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -10995,7 +10811,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
@@ -11010,7 +10826,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v1
; GFX7-NEXT: v_mov_b32_e32 v5, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -11092,13 +10908,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -11119,7 +10934,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -11135,11 +10950,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
@@ -11165,7 +10978,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -11184,11 +10997,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
@@ -11214,7 +11025,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -11233,12 +11044,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -11260,7 +11070,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -11278,13 +11088,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -11304,7 +11113,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -11321,13 +11130,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -11348,7 +11156,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -11365,11 +11173,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -11393,7 +11200,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -11410,7 +11217,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -11419,7 +11225,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -11434,7 +11240,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
@@ -11517,13 +11323,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -11545,7 +11350,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -11560,12 +11365,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
@@ -11594,7 +11398,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -11612,10 +11416,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -11646,7 +11449,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -11665,9 +11468,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
@@ -11693,7 +11495,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v6
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -11711,13 +11513,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -11738,7 +11539,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -11755,13 +11556,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -11783,7 +11583,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: v_mov_b32_e32 v1, v6
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -11800,11 +11600,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -11829,7 +11628,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: v_mov_b32_e32 v1, v6
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -11845,7 +11644,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -11854,7 +11652,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
@@ -11869,7 +11667,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v1
; GFX7-NEXT: v_mov_b32_e32 v5, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -11951,13 +11749,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -11978,7 +11775,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -11994,11 +11791,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
@@ -12024,7 +11819,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -12043,11 +11838,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
@@ -12073,7 +11866,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -12092,12 +11885,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -12119,7 +11911,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -12137,13 +11929,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -12163,7 +11954,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -12180,13 +11971,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -12207,7 +11997,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -12224,11 +12014,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -12252,7 +12041,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -12269,7 +12058,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -12278,7 +12066,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -12293,7 +12081,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
@@ -12375,13 +12163,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -12402,7 +12189,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -12418,11 +12205,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
@@ -12448,7 +12233,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -12467,11 +12252,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
@@ -12497,7 +12280,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -12516,12 +12299,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -12543,7 +12325,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -12561,13 +12343,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -12587,7 +12368,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -12604,13 +12385,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -12631,7 +12411,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -12648,11 +12428,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -12676,7 +12455,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -12693,7 +12472,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -12702,7 +12480,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -12717,7 +12495,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
@@ -12825,8 +12603,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start
@@ -12837,7 +12614,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -12855,9 +12632,8 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -12865,7 +12641,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
@@ -12883,9 +12659,8 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -12893,7 +12668,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -12910,9 +12685,8 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -12920,7 +12694,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -12937,9 +12711,8 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -12947,7 +12720,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index f7a1fb3..316ba85 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -37,10 +37,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -49,7 +48,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX942-NEXT: v_max_f32_e32 v4, v0, v2
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -88,10 +87,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -99,7 +97,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -116,10 +114,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -128,7 +125,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: v_max_f32_e32 v4, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -145,10 +142,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -157,7 +153,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: v_max_f32_e32 v4, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -212,10 +208,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -223,7 +218,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX942-NEXT: v_max_f32_e32 v0, v0, v2
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -262,17 +257,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -289,10 +283,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -300,7 +293,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -317,10 +310,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -328,7 +320,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -402,7 +394,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -414,22 +405,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB2_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_max_f32_e32 v9, v5, v5
+; GFX942-NEXT: v_max_f32_e32 v5, v5, v5
; GFX942-NEXT: .LBB2_3: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB2_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_max_f32_e32 v4, v7, v7
-; GFX942-NEXT: v_max_f32_e32 v6, v4, v9
+; GFX942-NEXT: v_max_f32_e32 v6, v9, v9
+; GFX942-NEXT: v_max_f32_e32 v8, v6, v5
; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -443,21 +433,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB2_4
; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB2_3
; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -522,7 +512,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -534,22 +523,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v5
; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7
-; GFX90A-NEXT: v_max_f32_e32 v6, v4, v9
+; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9
+; GFX90A-NEXT: v_max_f32_e32 v8, v6, v5
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -561,27 +549,26 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -593,8 +580,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_1
; GFX908-NEXT: ; %bb.2:
@@ -605,11 +591,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB2_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v4, v6, v6
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v8
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_max_f32_e32 v5, v7, v7
+; GFX908-NEXT: v_max_f32_e32 v6, v5, v8
; GFX908-NEXT: v_mov_b32_e32 v5, v6
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -621,27 +607,26 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB2_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -653,8 +638,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB2_1
; GFX8-NEXT: ; %bb.2:
@@ -665,11 +649,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB2_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GFX8-NEXT: v_max_f32_e32 v5, v4, v8
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v7
+; GFX8-NEXT: v_max_f32_e32 v6, v5, v8
; GFX8-NEXT: v_mov_b32_e32 v5, v6
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -681,21 +665,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB2_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB2_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -777,10 +761,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -789,7 +772,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX942-NEXT: v_max_f32_e32 v4, v0, v2
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -804,11 +787,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_max_f32 v2, v1, v1
+; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -819,7 +801,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX11-NEXT: v_max_f32_e32 v4, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -837,11 +819,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -851,7 +832,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: v_max_f32_e32 v4, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -869,10 +850,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -880,7 +860,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -897,10 +877,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -909,7 +888,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX908-NEXT: v_max_f32_e32 v4, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -926,10 +905,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -938,7 +916,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX8-NEXT: v_max_f32_e32 v4, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -955,10 +933,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX7-NEXT: v_mov_b32_e32 v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -967,7 +944,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX7-NEXT: v_max_f32_e32 v4, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1035,10 +1012,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -1047,7 +1023,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX942-NEXT: v_max_f32_e32 v4, v0, v2
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1086,10 +1062,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1097,7 +1072,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1114,10 +1089,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1126,7 +1100,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: v_max_f32_e32 v4, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1143,10 +1117,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1155,7 +1128,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: v_max_f32_e32 v4, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1203,12 +1176,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1220,7 +1192,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -1248,12 +1220,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1265,7 +1236,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -1306,9 +1277,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1320,7 +1290,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -1339,9 +1309,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1353,7 +1322,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -1397,11 +1366,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s16
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
-; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1411,7 +1378,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
@@ -1440,11 +1407,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s16
; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
-; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1454,7 +1419,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen offset:2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -1494,9 +1459,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v2, s20
; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1506,7 +1470,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v9, v2
; GFX908-NEXT: v_mov_b32_e32 v8, v1
; GFX908-NEXT: v_mov_b32_e32 v7, v0
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
@@ -1525,9 +1489,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1537,7 +1500,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v9, v2
; GFX8-NEXT: v_mov_b32_e32 v8, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v0
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
@@ -1583,10 +1546,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_readfirstlane_b32 s4, v9
; GFX12-NEXT: v_readfirstlane_b32 s5, v10
; GFX12-NEXT: v_readfirstlane_b32 s6, v7
@@ -1600,12 +1562,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
-; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
+; GFX12-NEXT: v_max_num_f64_e32 v[5:6], v[5:6], v[5:6]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
@@ -1615,7 +1576,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[5:6]
; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
@@ -1632,7 +1593,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v4, s[4:7], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -1686,27 +1647,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s4, v9
; GFX11-NEXT: v_readfirstlane_b32 s5, v10
; GFX11-NEXT: v_readfirstlane_b32 s6, v7
; GFX11-NEXT: v_readfirstlane_b32 s7, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
-; GFX11-NEXT: ; implicit-def: $vgpr4
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX11-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
@@ -1716,7 +1676,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5]
+; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6]
; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
@@ -1732,7 +1692,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v4, s[4:7], 0 offen offset:2048 glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB7_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -1816,7 +1776,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: v_mov_b32_e32 v7, v2
; GFX908-NEXT: v_mov_b32_e32 v10, v1
; GFX908-NEXT: v_mov_b32_e32 v9, v0
-; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v9
@@ -1829,12 +1788,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX908-NEXT: ; implicit-def: $vgpr4
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX908-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
@@ -1842,7 +1800,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5]
+; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6]
; GFX908-NEXT: v_mov_b32_e32 v0, v11
; GFX908-NEXT: v_mov_b32_e32 v1, v12
; GFX908-NEXT: v_mov_b32_e32 v2, v13
@@ -1858,7 +1816,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -1882,7 +1840,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: v_mov_b32_e32 v7, v2
; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v9, v0
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v9
@@ -1895,12 +1852,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX8-NEXT: ; implicit-def: $vgpr4
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX8-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
@@ -1908,7 +1864,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5]
+; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6]
; GFX8-NEXT: v_mov_b32_e32 v0, v11
; GFX8-NEXT: v_mov_b32_e32 v1, v12
; GFX8-NEXT: v_mov_b32_e32 v2, v13
@@ -1924,7 +1880,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -2010,12 +1966,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2027,7 +1982,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -2055,12 +2010,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2072,7 +2026,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -2091,11 +2045,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: s_add_i32 s4, s20, 0x800
-; GFX10-NEXT: v_mov_b32_e32 v6, s4
+; GFX10-NEXT: v_mov_b32_e32 v6, s20
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2108,7 +2061,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: v_mov_b32_e32 v2, v9
; GFX10-NEXT: v_mov_b32_e32 v3, v10
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -2127,10 +2080,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x800
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
+; GFX90A-NEXT: v_mov_b32_e32 v6, s20
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2139,7 +2091,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
@@ -2158,9 +2110,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -2172,7 +2123,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2191,9 +2142,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -2205,7 +2155,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2223,10 +2173,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX7-NEXT: s_add_i32 s6, s20, 0x800
; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
+; GFX7-NEXT: v_mov_b32_e32 v6, s20
; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -2238,7 +2187,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX7-NEXT: v_mov_b32_e32 v1, v8
; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v3, v10
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2298,12 +2247,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2315,7 +2263,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -2343,12 +2291,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2360,7 +2307,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -2401,9 +2348,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -2415,7 +2361,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2434,9 +2380,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -2448,7 +2393,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -6146,13 +6091,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: v_mov_b32_e32 v3, s16
+; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6163,7 +6106,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
@@ -6182,10 +6125,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -6195,7 +6137,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX942-NEXT: v_pk_max_f16 v4, v0, v2
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -6210,12 +6152,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: v_mov_b32_e32 v3, s16
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v2, v1, v1
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -6226,7 +6167,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX11-NEXT: v_pk_max_f16 v4, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -6244,11 +6185,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_pk_max_f16 v2, v1, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -6258,7 +6198,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: v_pk_max_f16 v4, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -6276,10 +6216,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6287,7 +6226,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -6304,10 +6243,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -6316,7 +6254,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX908-NEXT: v_pk_max_f16 v4, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -6333,11 +6271,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -6349,7 +6286,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: v_or_b32_e32 v5, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: v_mov_b32_e32 v1, v6
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -6367,7 +6304,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -6375,7 +6311,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -6392,7 +6328,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
; GFX7-NEXT: v_mov_b32_e32 v8, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
@@ -6467,10 +6403,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: v_mov_b32_e32 v3, s16
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
@@ -6481,7 +6415,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
@@ -6500,10 +6434,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -6512,7 +6445,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX942-NEXT: v_pk_max_f16 v0, v0, v2
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -6528,9 +6461,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: v_mov_b32_e32 v3, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
@@ -6541,7 +6473,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v2
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -6559,9 +6491,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
@@ -6572,7 +6503,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX10-NEXT: v_pk_max_f16 v0, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -6590,17 +6521,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -6617,10 +6547,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -6628,7 +6557,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX908-NEXT: v_pk_max_f16 v0, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -6645,11 +6574,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -6660,7 +6588,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -6679,7 +6607,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -6687,7 +6614,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
@@ -6704,7 +6631,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
@@ -6778,7 +6705,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6793,8 +6719,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2:
@@ -6805,13 +6730,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB18_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6
+; GFX12-NEXT: v_pk_max_num_f16 v5, v7, v7
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-NEXT: v_pk_max_num_f16 v6, v5, v8
; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: v_mov_b32_e32 v6, v7
; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6826,14 +6751,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX12-NEXT: v_mov_b32_e32 v7, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6841,14 +6766,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_cbranch_execnz .LBB18_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6860,23 +6784,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_pk_max_f16 v9, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v5, v5, v5
; GFX942-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB18_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_pk_max_f16 v4, v7, v7
+; GFX942-NEXT: v_pk_max_f16 v6, v9, v9
; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_pk_max_f16 v6, v4, v9
+; GFX942-NEXT: v_pk_max_f16 v8, v6, v5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6889,27 +6812,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_4
; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB18_3
; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
@@ -6923,8 +6845,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_1
; GFX11-NEXT: ; %bb.2:
@@ -6935,13 +6856,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB18_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v4, v6, v6
+; GFX11-NEXT: v_pk_max_f16 v5, v7, v7
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v8
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_pk_max_f16 v6, v5, v8
; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v7
; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -6955,14 +6876,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT: v_mov_b32_e32 v7, v5
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -6971,13 +6892,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_cbranch_execnz .LBB18_3
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-NEXT: v_mov_b32_e32 v0, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
@@ -6989,8 +6909,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_1
@@ -7001,12 +6920,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB18_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v4, v6, v6
+; GFX10-NEXT: v_pk_max_f16 v5, v7, v7
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v8
-; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_pk_max_f16 v6, v5, v8
; GFX10-NEXT: v_mov_b32_e32 v5, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v7
; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -7018,15 +6937,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -7035,13 +6954,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_cbranch_execnz .LBB18_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7053,22 +6971,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5
+; GFX90A-NEXT: v_pk_max_f16 v5, v5, v5
; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7
-; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9
+; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9
+; GFX90A-NEXT: v_pk_max_f16 v8, v6, v5
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7080,27 +6997,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -7112,8 +7028,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_1
; GFX908-NEXT: ; %bb.2:
@@ -7124,11 +7039,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB18_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v4, v6, v6
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v8
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_pk_max_f16 v5, v7, v7
+; GFX908-NEXT: v_pk_max_f16 v6, v5, v8
; GFX908-NEXT: v_mov_b32_e32 v5, v6
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -7140,27 +7055,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB18_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -7172,8 +7086,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
; GFX8-NEXT: ; %bb.2:
@@ -7185,14 +7098,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB18_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
-; GFX8-NEXT: v_max_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v5, v9
-; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_max_f16_sdwa v5, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v7, v7
+; GFX8-NEXT: v_max_f16_sdwa v5, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v9
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -7204,27 +7117,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB18_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -7235,39 +7147,38 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v9
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB18_4 Depth 2
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v7
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v10
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v11
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v5
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v10
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v11
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX7-NEXT: v_mov_b32_e32 v9, v6
+; GFX7-NEXT: v_mov_b32_e32 v8, v5
; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -7279,23 +7190,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB18_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB18_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -7396,13 +7307,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -7431,7 +7340,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
@@ -7452,11 +7361,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
@@ -7487,7 +7394,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
@@ -7506,13 +7413,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -7534,7 +7440,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -7549,12 +7455,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
@@ -7583,7 +7488,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -7601,10 +7506,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -7635,7 +7539,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -7654,9 +7558,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
@@ -7682,7 +7585,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v6
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -7700,13 +7603,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -7727,7 +7629,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -7744,13 +7646,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -7772,7 +7673,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: v_mov_b32_e32 v1, v6
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -7789,11 +7690,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -7818,7 +7718,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: v_mov_b32_e32 v1, v6
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -7834,7 +7734,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -7843,7 +7742,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
@@ -7858,7 +7757,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v1
; GFX7-NEXT: v_mov_b32_e32 v5, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -7928,11 +7827,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -7958,7 +7855,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
@@ -7980,11 +7877,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
@@ -8010,7 +7905,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
@@ -8029,13 +7924,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -8056,7 +7950,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -8072,11 +7966,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
@@ -8102,7 +7994,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -8121,11 +8013,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
@@ -8151,7 +8041,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -8170,12 +8060,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -8197,7 +8086,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -8215,13 +8104,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -8241,7 +8129,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -8258,13 +8146,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -8285,7 +8172,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -8302,11 +8189,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -8330,7 +8216,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -8347,7 +8233,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -8356,7 +8241,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
@@ -8371,7 +8256,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
@@ -8440,7 +8325,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
; GFX12-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8455,8 +8339,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-TRUE16-NEXT: ; %bb.2:
@@ -8468,30 +8351,30 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v8 :: v_dual_max_num_f32 v4, v4, v9
-; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v6, v6, v8 :: v_dual_max_num_f32 v5, v5, v9
+; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8506,14 +8389,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_4
; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v5
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -8521,7 +8404,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_3
; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v5
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -8532,7 +8415,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo
; GFX12-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8547,8 +8429,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-FAKE16-NEXT: ; %bb.2:
@@ -8560,30 +8441,30 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8
-; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v6, v6, v9 :: v_dual_max_num_f32 v5, v5, v8
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v7
; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8598,14 +8479,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_4
; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v5
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -8613,14 +8494,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_3
; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v5
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -8632,40 +8512,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX942-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX942-NEXT: s_movk_i32 s10, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX942-NEXT: s_mov_b32 s11, 0x7060302
; GFX942-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB21_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX942-NEXT: v_max_f32_e32 v4, v4, v9
-; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX942-NEXT: v_max_f32_e32 v6, v6, v10
+; GFX942-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX942-NEXT: v_add3_u32 v7, v7, v6, s10
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v10
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s10
-; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
+; GFX942-NEXT: v_max_f32_e32 v7, v7, v5
+; GFX942-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX942-NEXT: v_add3_u32 v8, v8, v7, s10
+; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
-; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc
+; GFX942-NEXT: v_perm_b32 v8, v7, v6, s11
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -8678,27 +8557,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_4
; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB21_3
; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -8712,8 +8590,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-TRUE16-NEXT: ; %bb.2:
@@ -8726,28 +8603,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v8 :: v_dual_max_f32 v4, v4, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v6, v6, v8 :: v_dual_max_f32 v5, v5, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8761,14 +8638,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_4
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v5
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -8778,13 +8655,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -8798,8 +8674,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-FAKE16-NEXT: ; %bb.2:
@@ -8812,28 +8687,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v9 :: v_dual_max_f32 v4, v4, v8
-; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v6, v6, v9 :: v_dual_max_f32 v5, v5, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7
; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8847,14 +8722,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_4
; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v5
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -8864,13 +8739,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v5
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -8882,8 +8756,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_1
@@ -8895,25 +8768,25 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB21_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v8
-; GFX10-NEXT: v_max_f32_e32 v5, v5, v9
-; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v8
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v9
+; GFX10-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX10-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
+; GFX10-NEXT: v_perm_b32 v6, v6, v5, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v5, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v7
; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -8925,15 +8798,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -8942,13 +8815,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_cbranch_execnz .LBB21_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -8960,38 +8832,37 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX90A-NEXT: s_mov_b32 s15, 0x7060302
; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v9
-; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v10
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14
-; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v10
+; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
+; GFX90A-NEXT: v_max_f32_e32 v7, v7, v5
+; GFX90A-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX90A-NEXT: v_add3_u32 v8, v8, v7, s14
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc
+; GFX90A-NEXT: v_perm_b32 v8, v7, v6, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -9003,27 +8874,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -9035,8 +8905,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_1
; GFX908-NEXT: ; %bb.2:
@@ -9050,24 +8919,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB21_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX908-NEXT: v_max_f32_e32 v4, v4, v8
-; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX908-NEXT: v_max_f32_e32 v5, v5, v9
-; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14
-; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v8
+; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX908-NEXT: v_add3_u32 v6, v6, v5, s14
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
-; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX908-NEXT: v_max_f32_e32 v6, v6, v9
+; GFX908-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX908-NEXT: v_add3_u32 v10, v10, v6, s14
+; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
+; GFX908-NEXT: v_perm_b32 v6, v6, v5, s15
; GFX908-NEXT: v_mov_b32_e32 v5, v6
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -9079,27 +8948,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB21_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -9111,8 +8979,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2:
@@ -9124,27 +8991,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT: v_max_f32_e32 v4, v4, v8
-; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_max_f32_e32 v5, v5, v9
-; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
-; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v8
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX8-NEXT: v_max_f32_e32 v6, v6, v9
+; GFX8-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v6
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v6, v6, v5, 16
; GFX8-NEXT: v_mov_b32_e32 v5, v6
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -9156,27 +9023,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB21_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -9187,8 +9053,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2:
@@ -9196,27 +9061,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v5
; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v9
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v10
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v10
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v11
+; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v8, 16
+; GFX7-NEXT: v_mov_b32_e32 v9, v6
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v8, v5
; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -9228,23 +9093,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB21_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB21_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v7
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v8
+; GFX7-NEXT: v_mov_b32_e32 v1, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -9353,10 +9218,9 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -9365,7 +9229,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX942-NEXT: v_max_f32_e32 v4, v0, v2
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -9404,10 +9268,9 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9416,7 +9279,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
@@ -9434,10 +9297,9 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -9446,7 +9308,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: v_max_f32_e32 v4, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -9463,10 +9325,9 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -9475,7 +9336,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: v_max_f32_e32 v4, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 8ac6353..ed67e02 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -37,10 +37,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -49,7 +48,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX942-NEXT: v_min_f32_e32 v4, v0, v2
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -88,10 +87,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -99,7 +97,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -116,10 +114,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -128,7 +125,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: v_min_f32_e32 v4, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -145,10 +142,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -157,7 +153,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: v_min_f32_e32 v4, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -212,10 +208,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -223,7 +218,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX942-NEXT: v_min_f32_e32 v0, v0, v2
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -262,17 +257,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -289,10 +283,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -300,7 +293,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: v_min_f32_e32 v0, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -317,10 +310,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -328,7 +320,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -402,7 +394,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -414,22 +405,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB2_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_max_f32_e32 v9, v5, v5
+; GFX942-NEXT: v_max_f32_e32 v5, v5, v5
; GFX942-NEXT: .LBB2_3: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB2_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_max_f32_e32 v4, v7, v7
-; GFX942-NEXT: v_min_f32_e32 v6, v4, v9
+; GFX942-NEXT: v_max_f32_e32 v6, v9, v9
+; GFX942-NEXT: v_min_f32_e32 v8, v6, v5
; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -443,21 +433,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB2_4
; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB2_3
; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -522,7 +512,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -534,22 +523,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v5
; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7
-; GFX90A-NEXT: v_min_f32_e32 v6, v4, v9
+; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9
+; GFX90A-NEXT: v_min_f32_e32 v8, v6, v5
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -561,27 +549,26 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -593,8 +580,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_1
; GFX908-NEXT: ; %bb.2:
@@ -605,11 +591,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB2_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v4, v6, v6
-; GFX908-NEXT: v_min_f32_e32 v5, v4, v8
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_max_f32_e32 v5, v7, v7
+; GFX908-NEXT: v_min_f32_e32 v6, v5, v8
; GFX908-NEXT: v_mov_b32_e32 v5, v6
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -621,27 +607,26 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB2_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -653,8 +638,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB2_1
; GFX8-NEXT: ; %bb.2:
@@ -665,11 +649,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB2_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GFX8-NEXT: v_min_f32_e32 v5, v4, v8
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v7
+; GFX8-NEXT: v_min_f32_e32 v6, v5, v8
; GFX8-NEXT: v_mov_b32_e32 v5, v6
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -681,21 +665,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB2_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB2_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -777,10 +761,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -789,7 +772,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX942-NEXT: v_min_f32_e32 v4, v0, v2
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -804,11 +787,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_max_f32 v2, v1, v1
+; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -819,7 +801,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX11-NEXT: v_min_f32_e32 v4, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -837,11 +819,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -851,7 +832,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: v_min_f32_e32 v4, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -869,10 +850,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -880,7 +860,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -897,10 +877,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -909,7 +888,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX908-NEXT: v_min_f32_e32 v4, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -926,10 +905,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -938,7 +916,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX8-NEXT: v_min_f32_e32 v4, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -955,10 +933,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX7-NEXT: v_mov_b32_e32 v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -967,7 +944,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX7-NEXT: v_min_f32_e32 v4, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1035,10 +1012,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -1047,7 +1023,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX942-NEXT: v_min_f32_e32 v4, v0, v2
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1086,10 +1062,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1097,7 +1072,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1114,10 +1089,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1126,7 +1100,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: v_min_f32_e32 v4, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1143,10 +1117,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1155,7 +1128,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: v_min_f32_e32 v4, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1203,12 +1176,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1220,7 +1192,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -1248,12 +1220,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1265,7 +1236,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -1306,9 +1277,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1320,7 +1290,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -1339,9 +1309,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1353,7 +1322,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -1397,11 +1366,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s16
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
-; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1411,7 +1378,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
@@ -1440,11 +1407,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s16
; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
-; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1454,7 +1419,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen offset:2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -1494,9 +1459,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v2, s20
; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1506,7 +1470,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v9, v2
; GFX908-NEXT: v_mov_b32_e32 v8, v1
; GFX908-NEXT: v_mov_b32_e32 v7, v0
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
@@ -1525,9 +1489,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1537,7 +1500,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v9, v2
; GFX8-NEXT: v_mov_b32_e32 v8, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v0
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
@@ -1583,10 +1546,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_readfirstlane_b32 s4, v9
; GFX12-NEXT: v_readfirstlane_b32 s5, v10
; GFX12-NEXT: v_readfirstlane_b32 s6, v7
@@ -1600,12 +1562,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
-; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
+; GFX12-NEXT: v_max_num_f64_e32 v[5:6], v[5:6], v[5:6]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
@@ -1615,7 +1576,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5]
+; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[5:6]
; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
@@ -1632,7 +1593,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v4, s[4:7], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -1686,27 +1647,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s4, v9
; GFX11-NEXT: v_readfirstlane_b32 s5, v10
; GFX11-NEXT: v_readfirstlane_b32 s6, v7
; GFX11-NEXT: v_readfirstlane_b32 s7, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
-; GFX11-NEXT: ; implicit-def: $vgpr4
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX11-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
@@ -1716,7 +1676,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5]
+; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6]
; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
@@ -1732,7 +1692,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v4, s[4:7], 0 offen offset:2048 glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB7_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -1816,7 +1776,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: v_mov_b32_e32 v7, v2
; GFX908-NEXT: v_mov_b32_e32 v10, v1
; GFX908-NEXT: v_mov_b32_e32 v9, v0
-; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v9
@@ -1829,12 +1788,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX908-NEXT: ; implicit-def: $vgpr4
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX908-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
@@ -1842,7 +1800,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5]
+; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6]
; GFX908-NEXT: v_mov_b32_e32 v0, v11
; GFX908-NEXT: v_mov_b32_e32 v1, v12
; GFX908-NEXT: v_mov_b32_e32 v2, v13
@@ -1858,7 +1816,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -1882,7 +1840,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: v_mov_b32_e32 v7, v2
; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v9, v0
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v9
@@ -1895,12 +1852,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX8-NEXT: ; implicit-def: $vgpr4
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX8-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
@@ -1908,7 +1864,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5]
+; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6]
; GFX8-NEXT: v_mov_b32_e32 v0, v11
; GFX8-NEXT: v_mov_b32_e32 v1, v12
; GFX8-NEXT: v_mov_b32_e32 v2, v13
@@ -1924,7 +1880,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -2010,12 +1966,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2027,7 +1982,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -2055,12 +2010,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2072,7 +2026,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -2091,11 +2045,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: s_add_i32 s4, s20, 0x800
-; GFX10-NEXT: v_mov_b32_e32 v6, s4
+; GFX10-NEXT: v_mov_b32_e32 v6, s20
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2108,7 +2061,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: v_mov_b32_e32 v2, v9
; GFX10-NEXT: v_mov_b32_e32 v3, v10
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -2127,10 +2080,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x800
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
+; GFX90A-NEXT: v_mov_b32_e32 v6, s20
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2139,7 +2091,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX90A-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
@@ -2158,9 +2110,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -2172,7 +2123,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2191,9 +2142,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -2205,7 +2155,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2223,10 +2173,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX7-NEXT: s_add_i32 s6, s20, 0x800
; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
+; GFX7-NEXT: v_mov_b32_e32 v6, s20
; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -2238,7 +2187,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX7-NEXT: v_mov_b32_e32 v1, v8
; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v3, v10
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2298,12 +2247,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2315,7 +2263,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -2343,12 +2291,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2360,7 +2307,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -2401,9 +2348,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -2415,7 +2361,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2434,9 +2380,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -2448,7 +2393,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -6146,13 +6091,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: v_mov_b32_e32 v3, s16
+; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6163,7 +6106,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
@@ -6182,10 +6125,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -6195,7 +6137,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX942-NEXT: v_pk_min_f16 v4, v0, v2
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -6210,12 +6152,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: v_mov_b32_e32 v3, s16
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v2, v1, v1
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -6226,7 +6167,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX11-NEXT: v_pk_min_f16 v4, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -6244,11 +6185,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_pk_max_f16 v2, v1, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -6258,7 +6198,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: v_pk_min_f16 v4, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -6276,10 +6216,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6287,7 +6226,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5
; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -6304,10 +6243,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -6316,7 +6254,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX908-NEXT: v_pk_min_f16 v4, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -6333,11 +6271,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -6349,7 +6286,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: v_or_b32_e32 v5, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: v_mov_b32_e32 v1, v6
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -6367,7 +6304,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -6375,7 +6311,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -6392,7 +6328,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
; GFX7-NEXT: v_mov_b32_e32 v8, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
@@ -6467,10 +6403,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: v_mov_b32_e32 v3, s16
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
@@ -6481,7 +6415,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
@@ -6500,10 +6434,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -6512,7 +6445,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX942-NEXT: v_pk_min_f16 v0, v0, v2
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -6528,9 +6461,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: v_mov_b32_e32 v3, s16
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
@@ -6541,7 +6473,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_min_f16 v0, v0, v2
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -6559,9 +6491,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
@@ -6572,7 +6503,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX10-NEXT: v_pk_min_f16 v0, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -6590,17 +6521,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -6617,10 +6547,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -6628,7 +6557,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX908-NEXT: v_pk_min_f16 v0, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -6645,11 +6574,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -6660,7 +6588,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -6679,7 +6607,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -6687,7 +6614,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
@@ -6704,7 +6631,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
@@ -6778,7 +6705,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6793,8 +6719,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2:
@@ -6805,13 +6730,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB18_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6
+; GFX12-NEXT: v_pk_max_num_f16 v5, v7, v7
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-NEXT: v_pk_min_num_f16 v6, v5, v8
; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: v_mov_b32_e32 v6, v7
; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6826,14 +6751,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX12-NEXT: v_mov_b32_e32 v7, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6841,14 +6766,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_cbranch_execnz .LBB18_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6860,23 +6784,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_pk_max_f16 v9, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v5, v5, v5
; GFX942-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB18_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_pk_max_f16 v4, v7, v7
+; GFX942-NEXT: v_pk_max_f16 v6, v9, v9
; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_pk_min_f16 v6, v4, v9
+; GFX942-NEXT: v_pk_min_f16 v8, v6, v5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6889,27 +6812,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_4
; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB18_3
; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
@@ -6923,8 +6845,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_1
; GFX11-NEXT: ; %bb.2:
@@ -6935,13 +6856,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB18_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v4, v6, v6
+; GFX11-NEXT: v_pk_max_f16 v5, v7, v7
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v5, v4, v8
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_pk_min_f16 v6, v5, v8
; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v7
; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -6955,14 +6876,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT: v_mov_b32_e32 v7, v5
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -6971,13 +6892,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_cbranch_execnz .LBB18_3
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-NEXT: v_mov_b32_e32 v0, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
@@ -6989,8 +6909,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_1
@@ -7001,12 +6920,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB18_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v4, v6, v6
+; GFX10-NEXT: v_pk_max_f16 v5, v7, v7
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_min_f16 v5, v4, v8
-; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_pk_min_f16 v6, v5, v8
; GFX10-NEXT: v_mov_b32_e32 v5, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v7
; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -7018,15 +6937,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -7035,13 +6954,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_cbranch_execnz .LBB18_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7053,22 +6971,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5
+; GFX90A-NEXT: v_pk_max_f16 v5, v5, v5
; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7
-; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9
+; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9
+; GFX90A-NEXT: v_pk_min_f16 v8, v6, v5
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7080,27 +6997,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -7112,8 +7028,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_1
; GFX908-NEXT: ; %bb.2:
@@ -7124,11 +7039,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB18_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v4, v6, v6
-; GFX908-NEXT: v_pk_min_f16 v5, v4, v8
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_pk_max_f16 v5, v7, v7
+; GFX908-NEXT: v_pk_min_f16 v6, v5, v8
; GFX908-NEXT: v_mov_b32_e32 v5, v6
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -7140,27 +7055,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB18_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -7172,8 +7086,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
; GFX8-NEXT: ; %bb.2:
@@ -7185,14 +7098,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB18_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
-; GFX8-NEXT: v_min_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v5, v5, v9
-; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_max_f16_sdwa v5, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v7, v7
+; GFX8-NEXT: v_min_f16_sdwa v5, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v6, v6, v9
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -7204,27 +7117,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB18_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -7235,39 +7147,38 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v9
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB18_4 Depth 2
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v7
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v10
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v11
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v5
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v10
+; GFX7-NEXT: v_min_f32_e32 v8, v8, v11
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX7-NEXT: v_mov_b32_e32 v9, v6
+; GFX7-NEXT: v_mov_b32_e32 v8, v5
; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -7279,23 +7190,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB18_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB18_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -7396,13 +7307,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -7431,7 +7340,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
@@ -7452,11 +7361,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
@@ -7487,7 +7394,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
@@ -7506,13 +7413,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -7534,7 +7440,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -7549,12 +7455,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
@@ -7583,7 +7488,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -7601,10 +7506,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -7635,7 +7539,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -7654,9 +7558,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
@@ -7682,7 +7585,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v6
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -7700,13 +7603,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -7727,7 +7629,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -7744,13 +7646,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -7772,7 +7673,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: v_mov_b32_e32 v1, v6
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -7789,11 +7690,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -7818,7 +7718,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: v_mov_b32_e32 v1, v6
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -7834,7 +7734,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -7843,7 +7742,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
@@ -7858,7 +7757,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v1
; GFX7-NEXT: v_mov_b32_e32 v5, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -7928,11 +7827,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -7958,7 +7855,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
@@ -7980,11 +7877,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
@@ -8010,7 +7905,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
@@ -8029,13 +7924,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -8056,7 +7950,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -8072,11 +7966,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
@@ -8102,7 +7994,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -8121,11 +8013,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
@@ -8151,7 +8041,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -8170,12 +8060,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -8197,7 +8086,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -8215,13 +8104,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -8241,7 +8129,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -8258,13 +8146,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -8285,7 +8172,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -8302,11 +8189,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -8330,7 +8216,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -8347,7 +8233,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -8356,7 +8241,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
@@ -8371,7 +8256,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
@@ -8440,7 +8325,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
; GFX12-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8455,8 +8339,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-TRUE16-NEXT: ; %bb.2:
@@ -8468,30 +8351,30 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v8 :: v_dual_min_num_f32 v4, v4, v9
-; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v6, v6, v8 :: v_dual_min_num_f32 v5, v5, v9
+; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8506,14 +8389,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_4
; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v5
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -8521,7 +8404,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_3
; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v5
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -8532,7 +8415,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo
; GFX12-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8547,8 +8429,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-FAKE16-NEXT: ; %bb.2:
@@ -8560,30 +8441,30 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8
-; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v6, v6, v9 :: v_dual_min_num_f32 v5, v5, v8
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v7
; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8598,14 +8479,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_4
; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v5
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -8613,14 +8494,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_3
; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v5
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -8632,40 +8512,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX942-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX942-NEXT: s_movk_i32 s10, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX942-NEXT: s_mov_b32 s11, 0x7060302
; GFX942-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB21_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX942-NEXT: v_min_f32_e32 v4, v4, v9
-; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX942-NEXT: v_min_f32_e32 v6, v6, v10
+; GFX942-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX942-NEXT: v_add3_u32 v7, v7, v6, s10
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v10
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s10
-; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
+; GFX942-NEXT: v_min_f32_e32 v7, v7, v5
+; GFX942-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX942-NEXT: v_add3_u32 v8, v8, v7, s10
+; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
-; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc
+; GFX942-NEXT: v_perm_b32 v8, v7, v6, s11
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -8678,27 +8557,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_4
; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB21_3
; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -8712,8 +8590,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-TRUE16-NEXT: ; %bb.2:
@@ -8726,28 +8603,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v8 :: v_dual_min_f32 v4, v4, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_dual_min_f32 v6, v6, v8 :: v_dual_min_f32 v5, v5, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8761,14 +8638,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_4
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v5
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -8778,13 +8655,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -8798,8 +8674,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-FAKE16-NEXT: ; %bb.2:
@@ -8812,28 +8687,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8
-; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_min_f32 v6, v6, v9 :: v_dual_min_f32 v5, v5, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7
; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8847,14 +8722,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_4
; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v5
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -8864,13 +8739,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v5
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -8882,8 +8756,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_1
@@ -8895,25 +8768,25 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB21_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_f32_e32 v4, v4, v8
-; GFX10-NEXT: v_min_f32_e32 v5, v5, v9
-; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v8
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v9
+; GFX10-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX10-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
+; GFX10-NEXT: v_perm_b32 v6, v6, v5, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v5, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v7
; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -8925,15 +8798,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -8942,13 +8815,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_cbranch_execnz .LBB21_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -8960,38 +8832,37 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX90A-NEXT: s_mov_b32 s15, 0x7060302
; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v9
-; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v10
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14
-; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v10
+; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
+; GFX90A-NEXT: v_min_f32_e32 v7, v7, v5
+; GFX90A-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX90A-NEXT: v_add3_u32 v8, v8, v7, s14
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc
+; GFX90A-NEXT: v_perm_b32 v8, v7, v6, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -9003,27 +8874,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -9035,8 +8905,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_1
; GFX908-NEXT: ; %bb.2:
@@ -9050,24 +8919,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB21_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX908-NEXT: v_min_f32_e32 v4, v4, v8
-; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX908-NEXT: v_min_f32_e32 v5, v5, v9
-; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14
-; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v8
+; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX908-NEXT: v_add3_u32 v6, v6, v5, s14
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
-; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX908-NEXT: v_min_f32_e32 v6, v6, v9
+; GFX908-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX908-NEXT: v_add3_u32 v10, v10, v6, s14
+; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
+; GFX908-NEXT: v_perm_b32 v6, v6, v5, s15
; GFX908-NEXT: v_mov_b32_e32 v5, v6
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -9079,27 +8948,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB21_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -9111,8 +8979,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2:
@@ -9124,27 +8991,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT: v_min_f32_e32 v4, v4, v8
-; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_min_f32_e32 v5, v5, v9
-; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
-; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v8
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX8-NEXT: v_min_f32_e32 v6, v6, v9
+; GFX8-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v6
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v6, v6, v5, 16
; GFX8-NEXT: v_mov_b32_e32 v5, v6
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -9156,27 +9023,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB21_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -9187,8 +9053,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2:
@@ -9196,27 +9061,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v5
; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v9
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v10
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v10
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_min_f32_e32 v8, v8, v11
+; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v8, 16
+; GFX7-NEXT: v_mov_b32_e32 v9, v6
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v8, v5
; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -9228,23 +9093,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB21_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB21_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v7
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v8
+; GFX7-NEXT: v_mov_b32_e32 v1, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -9353,10 +9218,9 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -9365,7 +9229,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX942-NEXT: v_min_f32_e32 v4, v0, v2
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -9404,10 +9268,9 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9416,7 +9279,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
@@ -9434,10 +9297,9 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -9446,7 +9308,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: v_min_f32_e32 v4, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -9463,10 +9325,9 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -9475,7 +9336,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: v_min_f32_e32 v4, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index 3c991cf..afd0f01 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -782,69 +782,90 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; SDAG-GFX942-LABEL: memcpy_known_medium:
; SDAG-GFX942: ; %bb.0:
; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x34
+; SDAG-GFX942-NEXT: s_load_dword s17, s[4:5], 0x34
; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x44
-; SDAG-GFX942-NEXT: s_load_dword s14, s[4:5], 0x54
-; SDAG-GFX942-NEXT: s_mov_b32 s12, 0
-; SDAG-GFX942-NEXT: s_mov_b32 s5, s12
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-GFX942-NEXT: s_load_dword s12, s[4:5], 0x54
+; SDAG-GFX942-NEXT: s_mov_b32 s16, 0
+; SDAG-GFX942-NEXT: s_mov_b32 s5, s16
; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX942-NEXT: s_mov_b32 s4, s3
-; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
-; SDAG-GFX942-NEXT: s_mov_b32 s13, s2
+; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[16:17]
+; SDAG-GFX942-NEXT: s_mov_b32 s17, s2
; SDAG-GFX942-NEXT: s_mov_b32 s2, s1
-; SDAG-GFX942-NEXT: s_mov_b32 s3, s12
-; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; SDAG-GFX942-NEXT: s_mov_b32 s13, s14
+; SDAG-GFX942-NEXT: s_mov_b32 s3, s16
+; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[16:17]
+; SDAG-GFX942-NEXT: s_mov_b32 s17, s12
; SDAG-GFX942-NEXT: s_mov_b32 s2, s11
-; SDAG-GFX942-NEXT: s_or_b64 s[14:15], s[2:3], s[12:13]
-; SDAG-GFX942-NEXT: s_mov_b32 s13, s10
+; SDAG-GFX942-NEXT: s_or_b64 s[14:15], s[2:3], s[16:17]
+; SDAG-GFX942-NEXT: s_mov_b32 s17, s10
; SDAG-GFX942-NEXT: s_mov_b32 s2, s9
-; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[12:13]
+; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17]
; SDAG-GFX942-NEXT: .LBB1_1: ; %load-store-loop
; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-GFX942-NEXT: v_add_u32_e32 v1, s0, v0
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[4:7], 0 offen
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v1, s[4:7], 0 offen offset:96
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v1, s[4:7], 0 offen offset:112
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v1, s[4:7], 0 offen offset:128
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v1, s[4:7], 0 offen offset:144
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v1, s[4:7], 0 offen offset:160
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v1, s[4:7], 0 offen offset:176
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240
-; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
-; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
-; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
-; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[12:15], 0 offen offset:96
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[12:15], 0 offen offset:112
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[12:15], 0 offen offset:128
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[12:15], 0 offen offset:144
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[12:15], 0 offen offset:160
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[12:15], 0 offen offset:176
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224
-; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
+; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32
+; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100
+; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x100
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240
-; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
+; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse
+; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse
+; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse
+; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[4:7], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, s[4:7], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224
+; SDAG-GFX942-NEXT: s_nop 0
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240
+; SDAG-GFX942-NEXT: s_nop 0
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16
+; SDAG-GFX942-NEXT: s_nop 1
+; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
+; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
+; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
+; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 offen offset:80
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[12:15], 0 offen offset:96
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 offen offset:240
+; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB1_1
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; SDAG-GFX942-NEXT: s_endpgm
;
@@ -852,84 +873,87 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; SDAG-GFX1100: ; %bb.0:
; SDAG-GFX1100-NEXT: s_clause 0x3
; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x34
+; SDAG-GFX1100-NEXT: s_load_b32 s17, s[4:5], 0x34
; SDAG-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
; SDAG-GFX1100-NEXT: s_load_b32 s18, s[4:5], 0x54
-; SDAG-GFX1100-NEXT: s_mov_b32 s12, 0
-; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, 0
-; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12
-; SDAG-GFX1100-NEXT: s_mov_b32 s15, s12
-; SDAG-GFX1100-NEXT: s_mov_b32 s17, s12
+; SDAG-GFX1100-NEXT: s_mov_b32 s16, 0
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-GFX1100-NEXT: s_mov_b32 s5, s16
+; SDAG-GFX1100-NEXT: s_mov_b32 s13, s16
+; SDAG-GFX1100-NEXT: s_mov_b32 s15, s16
; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3
-; SDAG-GFX1100-NEXT: s_mov_b32 s14, s1
-; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
-; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2
-; SDAG-GFX1100-NEXT: s_mov_b32 s16, s11
-; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[14:15], s[12:13]
-; SDAG-GFX1100-NEXT: s_mov_b32 s13, s18
+; SDAG-GFX1100-NEXT: s_mov_b32 s12, s1
+; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[16:17]
+; SDAG-GFX1100-NEXT: s_mov_b32 s17, s2
+; SDAG-GFX1100-NEXT: s_mov_b32 s14, s11
+; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[12:13], s[16:17]
+; SDAG-GFX1100-NEXT: s_mov_b32 s17, s18
; SDAG-GFX1100-NEXT: s_mov_b32 s2, s9
-; SDAG-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[12:13]
-; SDAG-GFX1100-NEXT: s_mov_b32 s13, s10
-; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12
+; SDAG-GFX1100-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
+; SDAG-GFX1100-NEXT: s_mov_b32 s17, s10
+; SDAG-GFX1100-NEXT: s_mov_b32 s3, s16
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[12:13]
+; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17]
; SDAG-GFX1100-NEXT: .LBB1_1: ; %load-store-loop
; SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
-; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0
-; SDAG-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0
-; SDAG-GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1
+; SDAG-GFX1100-NEXT: s_add_i32 s1, s0, s16
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-GFX1100-NEXT: v_mov_b32_e32 v60, s1
+; SDAG-GFX1100-NEXT: s_add_i32 s1, s8, s16
+; SDAG-GFX1100-NEXT: s_addk_i32 s16, 0x100
+; SDAG-GFX1100-NEXT: v_mov_b32_e32 v64, s1
+; SDAG-GFX1100-NEXT: s_cmpk_lt_u32 s16, 0x100
; SDAG-GFX1100-NEXT: s_clause 0xf
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[5:8], v61, s[4:7], 0 offen offset:16
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[9:12], v61, s[4:7], 0 offen offset:32
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[13:16], v61, s[4:7], 0 offen offset:48
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[17:20], v61, s[4:7], 0 offen offset:64
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[21:24], v61, s[4:7], 0 offen offset:80
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[25:28], v61, s[4:7], 0 offen offset:96
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[29:32], v61, s[4:7], 0 offen offset:112
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[33:36], v61, s[4:7], 0 offen offset:128
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[37:40], v61, s[4:7], 0 offen offset:144
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[41:44], v61, s[4:7], 0 offen offset:160
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[45:48], v61, s[4:7], 0 offen offset:176
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[49:52], v61, s[4:7], 0 offen offset:192
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[53:56], v61, s[4:7], 0 offen offset:208
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[57:60], v61, s[4:7], 0 offen offset:224
-; SDAG-GFX1100-NEXT: buffer_load_b128 v[61:64], v61, s[4:7], 0 offen offset:240
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v60, s[4:7], 0 offen
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[4:7], v60, s[4:7], 0 offen offset:16
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[8:11], v60, s[4:7], 0 offen offset:32
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[12:15], v60, s[4:7], 0 offen offset:48
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[16:19], v60, s[4:7], 0 offen offset:64
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[20:23], v60, s[4:7], 0 offen offset:80
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[24:27], v60, s[4:7], 0 offen offset:96
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[28:31], v60, s[4:7], 0 offen offset:112
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[32:35], v60, s[4:7], 0 offen offset:128
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[36:39], v60, s[4:7], 0 offen offset:144
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[40:43], v60, s[4:7], 0 offen offset:160
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[44:47], v60, s[4:7], 0 offen offset:176
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[48:51], v60, s[4:7], 0 offen offset:192
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[52:55], v60, s[4:7], 0 offen offset:208
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[56:59], v60, s[4:7], 0 offen offset:224
+; SDAG-GFX1100-NEXT: buffer_load_b128 v[60:63], v60, s[4:7], 0 offen offset:240
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[1:4], v65, s[12:15], 0 offen
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[0:3], v64, s[12:15], 0 offen
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(14)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[5:8], v65, s[12:15], 0 offen offset:16
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[4:7], v64, s[12:15], 0 offen offset:16
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(13)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[9:12], v65, s[12:15], 0 offen offset:32
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[8:11], v64, s[12:15], 0 offen offset:32
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(12)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[13:16], v65, s[12:15], 0 offen offset:48
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[12:15], v64, s[12:15], 0 offen offset:48
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(11)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[17:20], v65, s[12:15], 0 offen offset:64
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[16:19], v64, s[12:15], 0 offen offset:64
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(10)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[21:24], v65, s[12:15], 0 offen offset:80
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[20:23], v64, s[12:15], 0 offen offset:80
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(9)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[25:28], v65, s[12:15], 0 offen offset:96
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[24:27], v64, s[12:15], 0 offen offset:96
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(8)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[29:32], v65, s[12:15], 0 offen offset:112
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[28:31], v64, s[12:15], 0 offen offset:112
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(7)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[33:36], v65, s[12:15], 0 offen offset:128
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[32:35], v64, s[12:15], 0 offen offset:128
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(6)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[37:40], v65, s[12:15], 0 offen offset:144
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[36:39], v64, s[12:15], 0 offen offset:144
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(5)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[41:44], v65, s[12:15], 0 offen offset:160
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[40:43], v64, s[12:15], 0 offen offset:160
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(4)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[45:48], v65, s[12:15], 0 offen offset:176
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[44:47], v64, s[12:15], 0 offen offset:176
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(3)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[49:52], v65, s[12:15], 0 offen offset:192
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[48:51], v64, s[12:15], 0 offen offset:192
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(2)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[53:56], v65, s[12:15], 0 offen offset:208
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[52:55], v64, s[12:15], 0 offen offset:208
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(1)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[57:60], v65, s[12:15], 0 offen offset:224
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[56:59], v64, s[12:15], 0 offen offset:224
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240
-; SDAG-GFX1100-NEXT: s_cbranch_vccnz .LBB1_1
+; SDAG-GFX1100-NEXT: buffer_store_b128 v[60:63], v64, s[12:15], 0 offen offset:240
+; SDAG-GFX1100-NEXT: s_cbranch_scc1 .LBB1_1
; SDAG-GFX1100-NEXT: ; %bb.2: ; %memcpy-split
; SDAG-GFX1100-NEXT: s_endpgm
;
@@ -957,52 +981,50 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s0, v0
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v1, s[8:11], 0 offen offset:96
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v1, s[8:11], 0 offen offset:112
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v1, s[8:11], 0 offen offset:128
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v1, s[8:11], 0 offen offset:144
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v1, s[8:11], 0 offen offset:160
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v1, s[8:11], 0 offen offset:176
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240
-; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
-; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
-; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
-; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
-; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
+; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v62, s[8:11], 0 offen offset:96
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v62, s[8:11], 0 offen offset:112
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v62, s[8:11], 0 offen offset:128
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v62, s[8:11], 0 offen offset:144
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v62, s[8:11], 0 offen offset:160
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v62, s[8:11], 0 offen offset:176
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
+; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
+; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
+; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
-; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[4:7], 0 offen offset:96
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[4:7], 0 offen offset:112
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[4:7], 0 offen offset:128
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[4:7], 0 offen offset:144
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[4:7], 0 offen offset:160
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[4:7], 0 offen offset:176
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224
-; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
+; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
+; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; GISEL-GFX942-NEXT: s_endpgm
@@ -1037,8 +1059,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0
-; GISEL-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0
-; GISEL-GFX1100-NEXT: s_xor_b32 s1, s1, -1
+; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
; GISEL-GFX1100-NEXT: s_clause 0xf
; GISEL-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen
; GISEL-GFX1100-NEXT: buffer_load_b128 v[5:8], v61, s[4:7], 0 offen offset:16
@@ -1056,7 +1077,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX1100-NEXT: buffer_load_b128 v[53:56], v61, s[4:7], 0 offen offset:208
; GISEL-GFX1100-NEXT: buffer_load_b128 v[57:60], v61, s[4:7], 0 offen offset:224
; GISEL-GFX1100-NEXT: buffer_load_b128 v[61:64], v61, s[4:7], 0 offen offset:240
-; GISEL-GFX1100-NEXT: s_xor_b32 s1, s1, -1
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(15)
; GISEL-GFX1100-NEXT: buffer_store_b128 v[1:4], v65, s[12:15], 0 offen
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(14)
@@ -1089,7 +1109,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX1100-NEXT: buffer_store_b128 v[57:60], v65, s[12:15], 0 offen offset:224
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240
-; GISEL-GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1
+; GISEL-GFX1100-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x100, v0
; GISEL-GFX1100-NEXT: s_cbranch_vccnz .LBB1_1
; GISEL-GFX1100-NEXT: ; %bb.2: ; %memcpy-split
; GISEL-GFX1100-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index bc3d378..3aa3663 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -11,9 +11,9 @@
; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
-; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi
-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi
-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
-; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination
,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination
,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
define void @empty() {
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 65d0102..6e52125 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -232,15 +232,15 @@
; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: Natural Loop Information
+; GCN-O1-NEXT: CodeGen Prepare
; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O1-NEXT: AMDGPU lower intrinsics
; GCN-O1-NEXT: CallGraph Construction
; GCN-O1-NEXT: Call Graph SCC Pass Manager
; GCN-O1-NEXT: DummyCGSCCPass
; GCN-O1-NEXT: FunctionPass Manager
-; GCN-O1-NEXT: Dominator Tree Construction
-; GCN-O1-NEXT: Natural Loop Information
-; GCN-O1-NEXT: CodeGen Prepare
; GCN-O1-NEXT: Lazy Value Information Analysis
; GCN-O1-NEXT: Lower SwitchInst's to branches
; GCN-O1-NEXT: Lower invoke and unwind, for unwindless code generators
@@ -533,21 +533,21 @@
; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O1-OPTS-NEXT: Dominator Tree Construction
+; GCN-O1-OPTS-NEXT: Natural Loop Information
+; GCN-O1-OPTS-NEXT: CodeGen Prepare
+; GCN-O1-OPTS-NEXT: Dominator Tree Construction
+; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
+; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
+; GCN-O1-OPTS-NEXT: Natural Loop Information
+; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
+; GCN-O1-OPTS-NEXT: GPU Load and Store Vectorizer
; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O1-OPTS-NEXT: AMDGPU lower intrinsics
; GCN-O1-OPTS-NEXT: CallGraph Construction
; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager
; GCN-O1-OPTS-NEXT: DummyCGSCCPass
; GCN-O1-OPTS-NEXT: FunctionPass Manager
-; GCN-O1-OPTS-NEXT: Dominator Tree Construction
-; GCN-O1-OPTS-NEXT: Natural Loop Information
-; GCN-O1-OPTS-NEXT: CodeGen Prepare
-; GCN-O1-OPTS-NEXT: Dominator Tree Construction
-; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
-; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
-; GCN-O1-OPTS-NEXT: Natural Loop Information
-; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
-; GCN-O1-OPTS-NEXT: GPU Load and Store Vectorizer
; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis
; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches
; GCN-O1-OPTS-NEXT: Lower invoke and unwind, for unwindless code generators
@@ -852,21 +852,21 @@
; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O2-NEXT: Dominator Tree Construction
+; GCN-O2-NEXT: Natural Loop Information
+; GCN-O2-NEXT: CodeGen Prepare
+; GCN-O2-NEXT: Dominator Tree Construction
+; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
+; GCN-O2-NEXT: Function Alias Analysis Results
+; GCN-O2-NEXT: Natural Loop Information
+; GCN-O2-NEXT: Scalar Evolution Analysis
+; GCN-O2-NEXT: GPU Load and Store Vectorizer
; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O2-NEXT: AMDGPU lower intrinsics
; GCN-O2-NEXT: CallGraph Construction
; GCN-O2-NEXT: Call Graph SCC Pass Manager
; GCN-O2-NEXT: DummyCGSCCPass
; GCN-O2-NEXT: FunctionPass Manager
-; GCN-O2-NEXT: Dominator Tree Construction
-; GCN-O2-NEXT: Natural Loop Information
-; GCN-O2-NEXT: CodeGen Prepare
-; GCN-O2-NEXT: Dominator Tree Construction
-; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
-; GCN-O2-NEXT: Function Alias Analysis Results
-; GCN-O2-NEXT: Natural Loop Information
-; GCN-O2-NEXT: Scalar Evolution Analysis
-; GCN-O2-NEXT: GPU Load and Store Vectorizer
; GCN-O2-NEXT: Lazy Value Information Analysis
; GCN-O2-NEXT: Lower SwitchInst's to branches
; GCN-O2-NEXT: Lower invoke and unwind, for unwindless code generators
@@ -1186,21 +1186,21 @@
; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O3-NEXT: Dominator Tree Construction
+; GCN-O3-NEXT: Natural Loop Information
+; GCN-O3-NEXT: CodeGen Prepare
+; GCN-O3-NEXT: Dominator Tree Construction
+; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
+; GCN-O3-NEXT: Function Alias Analysis Results
+; GCN-O3-NEXT: Natural Loop Information
+; GCN-O3-NEXT: Scalar Evolution Analysis
+; GCN-O3-NEXT: GPU Load and Store Vectorizer
; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O3-NEXT: AMDGPU lower intrinsics
; GCN-O3-NEXT: CallGraph Construction
; GCN-O3-NEXT: Call Graph SCC Pass Manager
; GCN-O3-NEXT: DummyCGSCCPass
; GCN-O3-NEXT: FunctionPass Manager
-; GCN-O3-NEXT: Dominator Tree Construction
-; GCN-O3-NEXT: Natural Loop Information
-; GCN-O3-NEXT: CodeGen Prepare
-; GCN-O3-NEXT: Dominator Tree Construction
-; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
-; GCN-O3-NEXT: Function Alias Analysis Results
-; GCN-O3-NEXT: Natural Loop Information
-; GCN-O3-NEXT: Scalar Evolution Analysis
-; GCN-O3-NEXT: GPU Load and Store Vectorizer
; GCN-O3-NEXT: Lazy Value Information Analysis
; GCN-O3-NEXT: Lower SwitchInst's to branches
; GCN-O3-NEXT: Lower invoke and unwind, for unwindless code generators
diff --git a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
index 5d5aad7..566eb1e 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
@@ -7,16 +7,12 @@
@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
-; GCN-LABEL: unreachable:
-; Function info:
-; codeLenInByte = 4
define internal fastcc void @unreachable() {
%fptr = load ptr, ptr addrspace(4) @gv.fptr0
call void %fptr()
unreachable
}
-
; GCN-LABEL: entry:
; GCN-NOT: s_swappc_b64
; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressU.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressU.ll
index 288dea0..b043ea1 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressU.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressU.ll
@@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 4, i32 666, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 }
+!5 = !{ !"StaticSampler", i32 4, i32 666, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressV.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressV.ll
index e9abcf9..8219ffd 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressV.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressV.ll
@@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 4, i32 2, i32 666, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 }
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 666, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressW.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressW.ll
index 238f488..31d8dd1 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressW.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressW.ll
@@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 666, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 }
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 666, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-BorderColor.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-BorderColor.ll
index 8dc69eb..2bb4af5 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-BorderColor.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-BorderColor.ll
@@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 666, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 }
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 666, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ComparisonFunc.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ComparisonFunc.ll
index b2c8faf..62fda73 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ComparisonFunc.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ComparisonFunc.ll
@@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 666, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 }
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 666, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Filter.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Filter.ll
index 758d262..7e8de14 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Filter.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Filter.ll
@@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 45, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 }
+!5 = !{ !"StaticSampler", i32 45, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Flag.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Flag.ll
new file mode 100644
index 0000000..8f7ef88
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Flag.ll
@@ -0,0 +1,19 @@
+; RUN: not opt -passes='print<dxil-root-signature>' %s -S -o - 2>&1 | FileCheck %s
+
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK: error: Invalid value for Static Sampler Flag: 4
+; CHECK-NOT: Root Signature Definitions
+
+define void @main() #0 {
+entry:
+ ret void
+}
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+
+!dx.rootsignatures = !{!2} ; list of function/root signature pairs
+!2 = !{ ptr @main, !3, i32 3 } ; function, root signature
+!3 = !{ !5 } ; list of root signature elements
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 4 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxAnisotropy.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxAnisotropy.ll
index 47d4b52..312e769 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxAnisotropy.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxAnisotropy.ll
@@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 666, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 }
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 666, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxLod.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxLod.ll
index 855e0c0..80fd208 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxLod.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxLod.ll
@@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 0x7FF8000000000000, i32 42, i32 0, i32 0 }
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 0x7FF8000000000000, i32 42, i32 0, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLod.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLod.ll
index 812749b..5daaf69 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLod.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLod.ll
@@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float 0x7FF8000000000000, float 1.280000e+02, i32 42, i32 0, i32 0 }
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float 0x7FF8000000000000, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLopBias.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLopBias.ll
index 6898aec..423987b 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLopBias.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLopBias.ll
@@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 6.660000e+02, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 }
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 6.660000e+02, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-RegisterSpace.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-RegisterSpace.ll
index dc6ee42..af630dc 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-RegisterSpace.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-RegisterSpace.ll
@@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 4294967280, i32 0 }
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 4294967280, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderRegister.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderRegister.ll
index 6cee1dd9..bd752f0 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderRegister.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderRegister.ll
@@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 4294967295, i32 0, i32 0 }
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 4294967295, i32 0, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderVisibility.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderVisibility.ll
index fa5bf12..ca0c02d 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderVisibility.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderVisibility.ll
@@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 666 }
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 666, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers.ll
index 1dd470d..77c5c7a 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers.ll
@@ -15,7 +15,7 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
!dx.rootsignatures = !{!2} ; list of function/root signature pairs
!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
!3 = !{ !5 } ; list of root signature elements
-!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 }
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 }
; DXC: - Name: RTS0
; DXC-NEXT: Size: 76
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers_V3.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers_V3.ll
new file mode 100644
index 0000000..7e56f04
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers_V3.ll
@@ -0,0 +1,42 @@
+; RUN: opt %s -dxil-embed -dxil-globals -S -o - | FileCheck %s
+; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK: @dx.rts0 = private constant [248 x i8] c"{{.*}}", section "RTS0", align 4
+
+define void @main() #0 {
+entry:
+ ret void
+}
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+
+!dx.rootsignatures = !{!2} ; list of function/root signature pairs
+!2 = !{ ptr @main, !3, i32 3 } ; function, root signature
+!3 = !{ !5, !6, !7, !8 } ; list of root signature elements
+!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 1 }
+!6 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 43, i32 0, i32 0, i32 2 }
+!7 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 44, i32 0, i32 0, i32 0 }
+!8 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 45, i32 0, i32 0, i32 3 }
+
+; DXC: - Name: RTS0
+; DXC-NEXT: Size: 248
+; DXC-NEXT: RootSignature:
+; DXC-NEXT: Version: 3
+; DXC-NEXT: NumRootParameters: 0
+; DXC-NEXT: RootParametersOffset: 24
+; DXC-NEXT: NumStaticSamplers: 4
+; DXC-NEXT: StaticSamplersOffset: 24
+; DXC-NEXT: Parameters: []
+; DXC-NEXT: Samplers:
+; DXC-LABEL: ShaderRegister: 42
+; DXC: SAMPLER_FLAG_UINT_BORDER_COLOR: true
+; DXC-LABEL: ShaderRegister: 43
+; DXC: SAMPLER_FLAG_NON_NORMALIZED_COORDINATES: true
+; DXC-LABEL: ShaderRegister: 44
+; DXC-NOT: SAMPLER_FLAG_NON_NORMALIZED_COORDINATES:
+; DXC-NOT: SAMPLER_FLAG_UINT_BORDER_COLOR:
+; DXC-LABEL: ShaderRegister: 45
+; DXC: SAMPLER_FLAG_UINT_BORDER_COLOR: true
+; DXC-NEXT: SAMPLER_FLAG_NON_NORMALIZED_COORDINATES: true
diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler.ll
index c244095..b68606d 100644
--- a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler.ll
+++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler.ll
@@ -10,6 +10,6 @@ entry:
!0 = !{ptr @CSMain, !1, i32 2}
!1 = !{!2, !3}
-!2 = !{ !"StaticSampler", i32 5, i32 4, i32 5, i32 3, float 0x3FF7CCCCC0000000, i32 10, i32 2, i32 1, float -1.270000e+02, float 1.220000e+02, i32 42, i32 0, i32 0 }
+!2 = !{ !"StaticSampler", i32 5, i32 4, i32 5, i32 3, float 0x3FF7CCCCC0000000, i32 10, i32 2, i32 1, float -1.270000e+02, float 1.220000e+02, i32 42, i32 0, i32 0, i32 0 }
!3 = !{!"DescriptorTable", i32 0, !4}
!4 = !{!"Sampler", i32 1, i32 42, i32 0, i32 -1, i32 0}
diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-static-sampler-range.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-static-sampler-range.ll
index 9ac02ebb..7c836e2 100644
--- a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-static-sampler-range.ll
+++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-static-sampler-range.ll
@@ -10,5 +10,5 @@ entry:
!0 = !{ptr @CSMain, !1, i32 2}
!1 = !{!2, !3}
-!2 = !{ !"StaticSampler", i32 5, i32 4, i32 5, i32 3, float 0x3FF7CCCCC0000000, i32 10, i32 2, i32 1, float -1.270000e+02, float 1.220000e+02, i32 42, i32 0, i32 0 }
-!3 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 }
+!2 = !{ !"StaticSampler", i32 5, i32 4, i32 5, i32 3, float 0x3FF7CCCCC0000000, i32 10, i32 2, i32 1, float -1.270000e+02, float 1.220000e+02, i32 42, i32 0, i32 0, i32 0 }
+!3 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
index 02825b2..19a1841 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -6018,3 +6018,39 @@ vector.latch: ; preds = %for.body419
for.cond.cleanup: ; preds = %vector.latch
ret void
}
+
+;; This is exactly like sink_add_splat except that the splat has operands
+;; which haven't been converted to undef.
+define void @sink_non_canonical_splat(ptr nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_non_canonical_splat:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lui a2, 1
+; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: .LBB131_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: bne a0, a2, .LBB131_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %broadcast.splatinsert = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds i32, ptr %a, i64 %index
+ %wide.load = load <4 x i32>, ptr %0, align 4
+ %1 = add <4 x i32> %wide.load, %broadcast.splat
+ store <4 x i32> %1, ptr %0, align 4
+ %index.next = add nuw i64 %index, 4
+ %2 = icmp eq i64 %index.next, 1024
+ br i1 %2, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt
index 07dbbdd..94edf22 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt
@@ -720,10 +720,12 @@
# GFX1250: v_cvt_f32_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xe4,0x0a,0x7e]
0x01,0xe5,0x0a,0x7e
-# GFX1250: v_cvt_f32_bf16_e32 v5, v1.l ; encoding: [0x01,0xe5,0x0a,0x7e]
+# GFX1250-REAL16: v_cvt_f32_bf16_e32 v5, v1.l ; encoding: [0x01,0xe5,0x0a,0x7e]
+# GFX1250-FAKE16: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e]
0x7f,0xe5,0x0a,0x7e
-# GFX1250: v_cvt_f32_bf16_e32 v5, v127.l ; encoding: [0x7f,0xe5,0x0a,0x7e]
+# GFX1250-REAL16: v_cvt_f32_bf16_e32 v5, v127.l ; encoding: [0x7f,0xe5,0x0a,0x7e]
+# GFX1250-FAKE16: v_cvt_f32_bf16_e32 v5, v127 ; encoding: [0x7f,0xe5,0x0a,0x7e]
0x6b,0xe4,0x0a,0x7e
# GFX1250: v_cvt_f32_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xe4,0x0a,0x7e]
@@ -732,7 +734,8 @@
# GFX1250: v_cvt_f32_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xe4,0x0a,0x7e]
0x81,0xe5,0x0a,0x7e
-# GFX1250: v_cvt_f32_bf16_e32 v5, v1.h ; encoding: [0x81,0xe5,0x0a,0x7e]
+# GFX1250-REAL16: v_cvt_f32_bf16_e32 v5, v1.h ; encoding: [0x81,0xe5,0x0a,0x7e]
+# GFX1250-FAKE16: v_cvt_f32_bf16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xe5,0x0a,0x7e]
0xff,0xf0,0x02,0x7e,0x34,0x12,0x00,0x00
# GFX1250-REAL16: v_cvt_f16_bf8_e32 v1.l, 0x1234 ; encoding: [0xff,0xf0,0x02,0x7e,0x34,0x12,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt
index c12ecb8..93286ca 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt
@@ -615,49 +615,64 @@
# GFX1250-REAL16: v_cos_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7f,0x81,0x1b,0x00,0xff]
0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30
-# GFX1250: v_cvt_f32_bf16_dpp v127, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v127, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff]
0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff]
0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff]
0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff]
0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff]
0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff]
0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13]
0xfa,0xe4,0x0a,0x7e,0x81,0x1b,0x00,0xff
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x81,0x1b,0x00,0xff]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x81,0x1b,0x00,0xff]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x81,0x1b,0x00,0xff]
0xfa,0xf0,0x02,0x7e,0x02,0x39,0x00,0xff
# GFX1250-REAL16: v_cvt_f16_bf8_dpp v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf0,0x02,0x7e,0x02,0x39,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt
index fa7b940..fb3f1b2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt
@@ -165,16 +165,20 @@
# GFX1250-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX1250: v_cvt_f32_bf16_dpp v127, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v127, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00]
0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05]
0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05]
0xe9,0xe4,0x0a,0x7e,0x81,0x77,0x39,0x05
-# GFX1250: v_cvt_f32_bf16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x81,0x77,0x39,0x05]
0xe9,0xf0,0x02,0x7e,0x02,0x77,0x39,0x05
# GFX1250-REAL16: v_cvt_f16_bf8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf0,0x02,0x7e,0x02,0x77,0x39,0x05]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
index c15e8d4..ab9b48f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
@@ -616,6 +616,45 @@ exit:
ret double %red.next
}
+define i32 @test_ptr_iv_load_used_by_other_load(ptr %start, ptr %end) {
+; CHECK-LABEL: define i32 @test_ptr_iv_load_used_by_other_load(
+; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ null, %[[ENTRY]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[RED_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[IV]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT: [[C_EXT:%.*]] = zext i1 [[C]] to i32
+; CHECK-NEXT: [[RED_NEXT]] = or i32 [[RED]], [[C_EXT]]
+; CHECK-NEXT: [[IV_NEXT]] = getelementptr nusw i8, ptr [[IV]], i64 32
+; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[IV]], [[END]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RED_LCSSA:%.*]] = phi i32 [ [[RED]], %[[LOOP]] ]
+; CHECK-NEXT: ret i32 [[RED_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %iv = phi ptr [ %iv.next, %loop ], [ null, %entry ]
+ %red = phi i32 [ %red.next, %loop ], [ 0, %entry ]
+ %0 = load ptr, ptr %iv, align 8
+ %1 = load i8, ptr %0, align 8
+ %c = icmp ne i8 %1, 0
+ %c.ext = zext i1 %c to i32
+ %red.next = or i32 %red, %c.ext
+ %iv.next = getelementptr nusw i8, ptr %iv, i64 32
+ %ec = icmp eq ptr %iv, %end
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i32 %red
+}
+
attributes #0 = { "target-cpu"="neoverse-512tvb" }
!0 = !{!1, !2, i64 0}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
new file mode 100644
index 0000000..8784873
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
@@ -0,0 +1,460 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6
+; RUN: opt -p loop-vectorize -mtriple=x86_64-linux-gnu -S %s | FileCheck --check-prefix=I64 %s
+; RUN: opt -p loop-vectorize -mtriple=i386-pc-linux-gnu -S %s | FileCheck --check-prefix=I32 %s
+
+
+define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 {
+; I64-LABEL: define void @test_store_initially_interleave(
+; I64-SAME: i32 [[N:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; I64-NEXT: [[ITER_CHECK:.*:]]
+; I64-NEXT: [[TMP4:%.*]] = add i32 [[N]], 1
+; I64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP4]], 4
+; I64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; I64: [[VECTOR_SCEVCHECK]]:
+; I64-NEXT: [[TMP1:%.*]] = icmp slt i32 [[N]], 0
+; I64-NEXT: br i1 [[TMP1]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; I64: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; I64-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i32 [[TMP4]], 16
+; I64-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; I64: [[VECTOR_PH]]:
+; I64-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP4]], 16
+; I64-NEXT: [[TMP2:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
+; I64-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 16, i32 [[N_MOD_VF]]
+; I64-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP4]], [[TMP3]]
+; I64-NEXT: br label %[[VECTOR_BODY:.*]]
+; I64: [[VECTOR_BODY]]:
+; I64-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; I64-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; I64-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; I64-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
+; I64-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
+; I64-NEXT: [[IV:%.*]] = add i32 [[INDEX]], 0
+; I64-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1
+; I64-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 2
+; I64-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 3
+; I64-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 4
+; I64-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 5
+; I64-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 6
+; I64-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 7
+; I64-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 8
+; I64-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 9
+; I64-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 10
+; I64-NEXT: [[TMP15:%.*]] = add i32 [[INDEX]], 11
+; I64-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 12
+; I64-NEXT: [[TMP17:%.*]] = add i32 [[INDEX]], 13
+; I64-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 14
+; I64-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], 15
+; I64-NEXT: [[TMP20:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double>
+; I64-NEXT: [[TMP21:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double>
+; I64-NEXT: [[TMP22:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double>
+; I64-NEXT: [[TMP23:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double>
+; I64-NEXT: [[ADD_PTR_I:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[IV]]
+; I64-NEXT: [[TMP25:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]]
+; I64-NEXT: [[TMP26:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]]
+; I64-NEXT: [[TMP27:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]]
+; I64-NEXT: [[TMP28:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]]
+; I64-NEXT: [[TMP29:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]]
+; I64-NEXT: [[TMP30:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]]
+; I64-NEXT: [[TMP31:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]]
+; I64-NEXT: [[TMP32:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]]
+; I64-NEXT: [[TMP33:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]]
+; I64-NEXT: [[TMP34:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]]
+; I64-NEXT: [[TMP35:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP15]]
+; I64-NEXT: [[TMP36:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP16]]
+; I64-NEXT: [[TMP37:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP17]]
+; I64-NEXT: [[TMP38:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP18]]
+; I64-NEXT: [[TMP39:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP19]]
+; I64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADD_PTR_I]], align 4
+; I64-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP25]], align 4
+; I64-NEXT: [[TMP42:%.*]] = load ptr, ptr [[TMP26]], align 4
+; I64-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP27]], align 4
+; I64-NEXT: [[TMP44:%.*]] = load ptr, ptr [[TMP28]], align 4
+; I64-NEXT: [[TMP45:%.*]] = load ptr, ptr [[TMP29]], align 4
+; I64-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP30]], align 4
+; I64-NEXT: [[TMP47:%.*]] = load ptr, ptr [[TMP31]], align 4
+; I64-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP32]], align 4
+; I64-NEXT: [[TMP49:%.*]] = load ptr, ptr [[TMP33]], align 4
+; I64-NEXT: [[TMP50:%.*]] = load ptr, ptr [[TMP34]], align 4
+; I64-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP35]], align 4
+; I64-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP36]], align 4
+; I64-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP37]], align 4
+; I64-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP38]], align 4
+; I64-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP39]], align 4
+; I64-NEXT: [[CONV:%.*]] = extractelement <4 x double> [[TMP20]], i32 0
+; I64-NEXT: store double [[CONV]], ptr [[TMP0]], align 4
+; I64-NEXT: [[TMP57:%.*]] = extractelement <4 x double> [[TMP20]], i32 1
+; I64-NEXT: store double [[TMP57]], ptr [[TMP41]], align 4
+; I64-NEXT: [[TMP58:%.*]] = extractelement <4 x double> [[TMP20]], i32 2
+; I64-NEXT: store double [[TMP58]], ptr [[TMP42]], align 4
+; I64-NEXT: [[TMP59:%.*]] = extractelement <4 x double> [[TMP20]], i32 3
+; I64-NEXT: store double [[TMP59]], ptr [[TMP43]], align 4
+; I64-NEXT: [[TMP60:%.*]] = extractelement <4 x double> [[TMP21]], i32 0
+; I64-NEXT: store double [[TMP60]], ptr [[TMP44]], align 4
+; I64-NEXT: [[TMP61:%.*]] = extractelement <4 x double> [[TMP21]], i32 1
+; I64-NEXT: store double [[TMP61]], ptr [[TMP45]], align 4
+; I64-NEXT: [[TMP62:%.*]] = extractelement <4 x double> [[TMP21]], i32 2
+; I64-NEXT: store double [[TMP62]], ptr [[TMP46]], align 4
+; I64-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP21]], i32 3
+; I64-NEXT: store double [[TMP63]], ptr [[TMP47]], align 4
+; I64-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP22]], i32 0
+; I64-NEXT: store double [[TMP64]], ptr [[TMP48]], align 4
+; I64-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP22]], i32 1
+; I64-NEXT: store double [[TMP65]], ptr [[TMP49]], align 4
+; I64-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP22]], i32 2
+; I64-NEXT: store double [[TMP66]], ptr [[TMP50]], align 4
+; I64-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP22]], i32 3
+; I64-NEXT: store double [[TMP67]], ptr [[TMP51]], align 4
+; I64-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP23]], i32 0
+; I64-NEXT: store double [[TMP68]], ptr [[TMP52]], align 4
+; I64-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP23]], i32 1
+; I64-NEXT: store double [[TMP69]], ptr [[TMP53]], align 4
+; I64-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP23]], i32 2
+; I64-NEXT: store double [[TMP70]], ptr [[TMP54]], align 4
+; I64-NEXT: [[TMP71:%.*]] = extractelement <4 x double> [[TMP23]], i32 3
+; I64-NEXT: store double [[TMP71]], ptr [[TMP55]], align 4
+; I64-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; I64-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4)
+; I64-NEXT: [[TMP72:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; I64-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; I64: [[MIDDLE_BLOCK]]:
+; I64-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; I64: [[VEC_EPILOG_ITER_CHECK]]:
+; I64-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP3]], 4
+; I64-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; I64: [[VEC_EPILOG_PH]]:
+; I64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; I64-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP4]], 4
+; I64-NEXT: [[TMP73:%.*]] = icmp eq i32 [[N_MOD_VF2]], 0
+; I64-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 4, i32 [[N_MOD_VF2]]
+; I64-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP4]], [[TMP74]]
+; I64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
+; I64-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; I64-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; I64-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; I64: [[VEC_EPILOG_VECTOR_BODY]]:
+; I64-NEXT: [[INDEX4:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; I64-NEXT: [[VEC_IND5:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; I64-NEXT: [[TMP75:%.*]] = add i32 [[INDEX4]], 0
+; I64-NEXT: [[TMP76:%.*]] = add i32 [[INDEX4]], 1
+; I64-NEXT: [[TMP77:%.*]] = add i32 [[INDEX4]], 2
+; I64-NEXT: [[TMP78:%.*]] = add i32 [[INDEX4]], 3
+; I64-NEXT: [[TMP79:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double>
+; I64-NEXT: [[TMP80:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]]
+; I64-NEXT: [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]]
+; I64-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]]
+; I64-NEXT: [[TMP83:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP78]]
+; I64-NEXT: [[TMP84:%.*]] = load ptr, ptr [[TMP80]], align 4
+; I64-NEXT: [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4
+; I64-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4
+; I64-NEXT: [[TMP87:%.*]] = load ptr, ptr [[TMP83]], align 4
+; I64-NEXT: [[TMP88:%.*]] = extractelement <4 x double> [[TMP79]], i32 0
+; I64-NEXT: store double [[TMP88]], ptr [[TMP84]], align 4
+; I64-NEXT: [[TMP89:%.*]] = extractelement <4 x double> [[TMP79]], i32 1
+; I64-NEXT: store double [[TMP89]], ptr [[TMP85]], align 4
+; I64-NEXT: [[TMP90:%.*]] = extractelement <4 x double> [[TMP79]], i32 2
+; I64-NEXT: store double [[TMP90]], ptr [[TMP86]], align 4
+; I64-NEXT: [[TMP91:%.*]] = extractelement <4 x double> [[TMP79]], i32 3
+; I64-NEXT: store double [[TMP91]], ptr [[TMP87]], align 4
+; I64-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4
+; I64-NEXT: [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4)
+; I64-NEXT: [[TMP92:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]]
+; I64-NEXT: br i1 [[TMP92]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; I64: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; I64-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]]
+; I64: [[VEC_EPILOG_SCALAR_PH]]:
+;
+; I32-LABEL: define void @test_store_initially_interleave(
+; I32-SAME: i32 [[N:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; I32-NEXT: [[ENTRY:.*:]]
+; I32-NEXT: [[TMP0:%.*]] = add i32 [[N]], 1
+; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], 4
+; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; I32: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; I32-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i32 [[TMP0]], 16
+; I32-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; I32: [[VECTOR_PH]]:
+; I32-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 16
+; I32-NEXT: [[TMP1:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
+; I32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 16, i32 [[N_MOD_VF]]
+; I32-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[TMP2]]
+; I32-NEXT: br label %[[VECTOR_BODY:.*]]
+; I32: [[VECTOR_BODY]]:
+; I32-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; I32-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; I32-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; I32-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
+; I32-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
+; I32-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
+; I32-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 1
+; I32-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 2
+; I32-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 3
+; I32-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 4
+; I32-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 5
+; I32-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 6
+; I32-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 7
+; I32-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 8
+; I32-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 9
+; I32-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 10
+; I32-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 11
+; I32-NEXT: [[TMP40:%.*]] = add i32 [[INDEX]], 12
+; I32-NEXT: [[TMP41:%.*]] = add i32 [[INDEX]], 13
+; I32-NEXT: [[TMP42:%.*]] = add i32 [[INDEX]], 14
+; I32-NEXT: [[TMP43:%.*]] = add i32 [[INDEX]], 15
+; I32-NEXT: [[TMP44:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double>
+; I32-NEXT: [[TMP45:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double>
+; I32-NEXT: [[TMP46:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double>
+; I32-NEXT: [[TMP55:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double>
+; I32-NEXT: [[TMP15:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP3]]
+; I32-NEXT: [[TMP16:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]]
+; I32-NEXT: [[TMP17:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]]
+; I32-NEXT: [[TMP18:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]]
+; I32-NEXT: [[TMP19:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]]
+; I32-NEXT: [[TMP20:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]]
+; I32-NEXT: [[TMP21:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]]
+; I32-NEXT: [[TMP22:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]]
+; I32-NEXT: [[TMP56:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]]
+; I32-NEXT: [[TMP57:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]]
+; I32-NEXT: [[TMP58:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]]
+; I32-NEXT: [[TMP59:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]]
+; I32-NEXT: [[TMP60:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP40]]
+; I32-NEXT: [[TMP61:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP41]]
+; I32-NEXT: [[TMP62:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP42]]
+; I32-NEXT: [[TMP71:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP43]]
+; I32-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP15]], align 4
+; I32-NEXT: [[TMP24:%.*]] = load ptr, ptr [[TMP16]], align 4
+; I32-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP17]], align 4
+; I32-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP18]], align 4
+; I32-NEXT: [[TMP27:%.*]] = load ptr, ptr [[TMP19]], align 4
+; I32-NEXT: [[TMP28:%.*]] = load ptr, ptr [[TMP20]], align 4
+; I32-NEXT: [[TMP29:%.*]] = load ptr, ptr [[TMP21]], align 4
+; I32-NEXT: [[TMP30:%.*]] = load ptr, ptr [[TMP22]], align 4
+; I32-NEXT: [[TMP47:%.*]] = load ptr, ptr [[TMP56]], align 4
+; I32-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP57]], align 4
+; I32-NEXT: [[TMP49:%.*]] = load ptr, ptr [[TMP58]], align 4
+; I32-NEXT: [[TMP50:%.*]] = load ptr, ptr [[TMP59]], align 4
+; I32-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP60]], align 4
+; I32-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP61]], align 4
+; I32-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP62]], align 4
+; I32-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP71]], align 4
+; I32-NEXT: [[TMP31:%.*]] = extractelement <4 x double> [[TMP44]], i32 0
+; I32-NEXT: store double [[TMP31]], ptr [[TMP23]], align 4
+; I32-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP44]], i32 1
+; I32-NEXT: store double [[TMP32]], ptr [[TMP24]], align 4
+; I32-NEXT: [[TMP33:%.*]] = extractelement <4 x double> [[TMP44]], i32 2
+; I32-NEXT: store double [[TMP33]], ptr [[TMP25]], align 4
+; I32-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP44]], i32 3
+; I32-NEXT: store double [[TMP34]], ptr [[TMP26]], align 4
+; I32-NEXT: [[TMP35:%.*]] = extractelement <4 x double> [[TMP45]], i32 0
+; I32-NEXT: store double [[TMP35]], ptr [[TMP27]], align 4
+; I32-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP45]], i32 1
+; I32-NEXT: store double [[TMP36]], ptr [[TMP28]], align 4
+; I32-NEXT: [[TMP37:%.*]] = extractelement <4 x double> [[TMP45]], i32 2
+; I32-NEXT: store double [[TMP37]], ptr [[TMP29]], align 4
+; I32-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[TMP45]], i32 3
+; I32-NEXT: store double [[TMP38]], ptr [[TMP30]], align 4
+; I32-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP46]], i32 0
+; I32-NEXT: store double [[TMP63]], ptr [[TMP47]], align 4
+; I32-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP46]], i32 1
+; I32-NEXT: store double [[TMP64]], ptr [[TMP48]], align 4
+; I32-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP46]], i32 2
+; I32-NEXT: store double [[TMP65]], ptr [[TMP49]], align 4
+; I32-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP46]], i32 3
+; I32-NEXT: store double [[TMP66]], ptr [[TMP50]], align 4
+; I32-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP55]], i32 0
+; I32-NEXT: store double [[TMP67]], ptr [[TMP51]], align 4
+; I32-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP55]], i32 1
+; I32-NEXT: store double [[TMP68]], ptr [[TMP52]], align 4
+; I32-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP55]], i32 2
+; I32-NEXT: store double [[TMP69]], ptr [[TMP53]], align 4
+; I32-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP55]], i32 3
+; I32-NEXT: store double [[TMP70]], ptr [[TMP54]], align 4
+; I32-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; I32-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4)
+; I32-NEXT: [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; I32-NEXT: br i1 [[TMP39]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; I32: [[MIDDLE_BLOCK]]:
+; I32-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; I32: [[VEC_EPILOG_ITER_CHECK]]:
+; I32-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP2]], 4
+; I32-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; I32: [[VEC_EPILOG_PH]]:
+; I32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; I32-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP0]], 4
+; I32-NEXT: [[TMP72:%.*]] = icmp eq i32 [[N_MOD_VF2]], 0
+; I32-NEXT: [[TMP73:%.*]] = select i1 [[TMP72]], i32 4, i32 [[N_MOD_VF2]]
+; I32-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP0]], [[TMP73]]
+; I32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
+; I32-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; I32-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; I32-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; I32: [[VEC_EPILOG_VECTOR_BODY]]:
+; I32-NEXT: [[INDEX4:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; I32-NEXT: [[VEC_IND5:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; I32-NEXT: [[TMP74:%.*]] = add i32 [[INDEX4]], 0
+; I32-NEXT: [[TMP75:%.*]] = add i32 [[INDEX4]], 1
+; I32-NEXT: [[TMP76:%.*]] = add i32 [[INDEX4]], 2
+; I32-NEXT: [[TMP77:%.*]] = add i32 [[INDEX4]], 3
+; I32-NEXT: [[TMP78:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double>
+; I32-NEXT: [[TMP79:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]]
+; I32-NEXT: [[TMP80:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]]
+; I32-NEXT: [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]]
+; I32-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]]
+; I32-NEXT: [[TMP83:%.*]] = load ptr, ptr [[TMP79]], align 4
+; I32-NEXT: [[TMP84:%.*]] = load ptr, ptr [[TMP80]], align 4
+; I32-NEXT: [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4
+; I32-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4
+; I32-NEXT: [[TMP87:%.*]] = extractelement <4 x double> [[TMP78]], i32 0
+; I32-NEXT: store double [[TMP87]], ptr [[TMP83]], align 4
+; I32-NEXT: [[TMP88:%.*]] = extractelement <4 x double> [[TMP78]], i32 1
+; I32-NEXT: store double [[TMP88]], ptr [[TMP84]], align 4
+; I32-NEXT: [[TMP89:%.*]] = extractelement <4 x double> [[TMP78]], i32 2
+; I32-NEXT: store double [[TMP89]], ptr [[TMP85]], align 4
+; I32-NEXT: [[TMP90:%.*]] = extractelement <4 x double> [[TMP78]], i32 3
+; I32-NEXT: store double [[TMP90]], ptr [[TMP86]], align 4
+; I32-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4
+; I32-NEXT: [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4)
+; I32-NEXT: [[TMP91:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]]
+; I32-NEXT: br i1 [[TMP91]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; I32: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; I32-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]]
+; I32: [[VEC_EPILOG_SCALAR_PH]]:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %inc, %loop ]
+ %conv = uitofp i32 %iv to double
+ %add.ptr.i = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 %iv
+ %0 = load ptr, ptr %add.ptr.i, align 4
+ store double %conv, ptr %0, align 4
+ %inc = add i32 %iv, 1
+ %ec = icmp eq i32 %iv, %n
+ br i1 %ec, label %exit, label %loop
+
+exit: ; preds = %loop
+ ret void
+}
+
+define void @test_store_loaded_value(ptr noalias %src, ptr noalias %dst, i32 %n) #0 {
+; I64-LABEL: define void @test_store_loaded_value(
+; I64-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; I64-NEXT: [[BB:.*:]]
+; I64-NEXT: [[PRE:%.*]] = icmp slt i32 [[N]], 1
+; I64-NEXT: br i1 [[PRE]], [[EXIT:label %.*]], label %[[PH:.*]]
+; I64: [[PH]]:
+; I64-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64
+; I64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_EXT]], 4
+; I64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; I64: [[VECTOR_PH]]:
+; I64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_EXT]], 4
+; I64-NEXT: [[N_VEC:%.*]] = sub i64 [[N_EXT]], [[N_MOD_VF]]
+; I64-NEXT: br label %[[VECTOR_BODY:.*]]
+; I64: [[VECTOR_BODY]]:
+; I64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; I64-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; I64-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; I64-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; I64-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; I64-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP0]]
+; I64-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]]
+; I64-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]]
+; I64-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; I64-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP4]], align 8
+; I64-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP5]], align 8
+; I64-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP6]], align 8
+; I64-NEXT: [[TMP11:%.*]] = load double, ptr [[TMP7]], align 8
+; I64-NEXT: [[TMP12:%.*]] = shl i64 [[TMP0]], 1
+; I64-NEXT: [[TMP13:%.*]] = shl i64 [[TMP1]], 1
+; I64-NEXT: [[TMP14:%.*]] = shl i64 [[TMP2]], 1
+; I64-NEXT: [[TMP15:%.*]] = shl i64 [[TMP3]], 1
+; I64-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; I64-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
+; I64-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
+; I64-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP15]]
+; I64-NEXT: store double [[TMP8]], ptr [[TMP16]], align 8
+; I64-NEXT: store double [[TMP9]], ptr [[TMP17]], align 8
+; I64-NEXT: store double [[TMP10]], ptr [[TMP18]], align 8
+; I64-NEXT: store double [[TMP11]], ptr [[TMP19]], align 8
+; I64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; I64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; I64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; I64: [[MIDDLE_BLOCK]]:
+; I64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_EXT]], [[N_VEC]]
+; I64-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
+; I64: [[SCALAR_PH]]:
+;
+; I32-LABEL: define void @test_store_loaded_value(
+; I32-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; I32-NEXT: [[BB:.*:]]
+; I32-NEXT: [[PRE:%.*]] = icmp slt i32 [[N]], 1
+; I32-NEXT: br i1 [[PRE]], [[EXIT:label %.*]], label %[[PH:.*]]
+; I32: [[PH]]:
+; I32-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64
+; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_EXT]], 4
+; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; I32: [[VECTOR_PH]]:
+; I32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_EXT]], 4
+; I32-NEXT: [[N_VEC:%.*]] = sub i64 [[N_EXT]], [[N_MOD_VF]]
+; I32-NEXT: br label %[[VECTOR_BODY:.*]]
+; I32: [[VECTOR_BODY]]:
+; I32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; I32-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; I32-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; I32-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; I32-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; I32-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP0]]
+; I32-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]]
+; I32-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]]
+; I32-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; I32-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP4]], align 8
+; I32-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP5]], align 8
+; I32-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP6]], align 8
+; I32-NEXT: [[TMP11:%.*]] = load double, ptr [[TMP7]], align 8
+; I32-NEXT: [[TMP12:%.*]] = shl i64 [[TMP0]], 1
+; I32-NEXT: [[TMP13:%.*]] = shl i64 [[TMP1]], 1
+; I32-NEXT: [[TMP14:%.*]] = shl i64 [[TMP2]], 1
+; I32-NEXT: [[TMP15:%.*]] = shl i64 [[TMP3]], 1
+; I32-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; I32-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
+; I32-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
+; I32-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP15]]
+; I32-NEXT: store double [[TMP8]], ptr [[TMP16]], align 8
+; I32-NEXT: store double [[TMP9]], ptr [[TMP17]], align 8
+; I32-NEXT: store double [[TMP10]], ptr [[TMP18]], align 8
+; I32-NEXT: store double [[TMP11]], ptr [[TMP19]], align 8
+; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; I32-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; I32-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; I32: [[MIDDLE_BLOCK]]:
+; I32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_EXT]], [[N_VEC]]
+; I32-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
+; I32: [[SCALAR_PH]]:
+;
+bb:
+ %pre = icmp slt i32 %n, 1
+ br i1 %pre, label %exit, label %ph
+
+ph:
+ %n.ext = zext i32 %n to i64
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
+ %iv.next = add i64 %iv, 1
+ %gep.src = getelementptr i8, ptr %src, i64 %iv
+ %l = load double, ptr %gep.src, align 8
+ %sext = shl i64 %iv, 1
+ %gep.dst = getelementptr i8, ptr %dst, i64 %sext
+ store double %l, ptr %gep.dst, align 8
+ %ec = icmp eq i64 %iv.next, %n.ext
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-cpu"="znver2" }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
index 4f52227..02e05b2 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
@@ -527,23 +527,14 @@ define void @rt_stride_1_with_reordering(ptr %pl, i64 %stride, ptr %ps) {
ret void
}
-; TODO: We want to generate this code:
-; define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
-; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 %offset0
-; %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
-; %strided_load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 %gep_l0, i64 8, <4 x i1> splat (i1 true), i32 4)
-; %bitcast_ = bitcast <4 x i32> %strided_load to <16 x i8>
-; store <16 x i8> %bitcast_, ptr %gep_s0, align 1
-; ret void
-; }
-define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
-; CHECK-LABEL: define void @constant_stride_widen_no_reordering(
+define void @constant_stride_masked_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
+; CHECK-LABEL: define void @constant_stride_masked_no_reordering(
; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
; CHECK-NEXT: [[TMP1:%.*]] = call <28 x i8> @llvm.masked.load.v28i8.p0(ptr [[GEP_L0]], i32 1, <28 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <28 x i8> poison)
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <28 x i8> [[TMP1]], <28 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
-; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <28 x i8> [[TMP1]], <28 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1
; CHECK-NEXT: ret void
;
%gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0
@@ -618,6 +609,107 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps)
}
; TODO: We want to generate this code:
+; define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) #0 {
+; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0
+; %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
+; %1 = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 %gep_l0, i64 100, <4 x i1> splat (i1 true), i32 4)
+; %2 = bitcast <4 x i32> %1 to <16 x i8>
+; store <16 x i8> %2, ptr %gep_s0, align 1
+; ret void
+; }
+define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
+; CHECK-LABEL: define void @constant_stride_widen_no_reordering(
+; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
+; CHECK-NEXT: [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 100
+; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 200
+; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 300
+; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1
+; CHECK-NEXT: ret void
+;
+ %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0
+ %gep_l1 = getelementptr inbounds i8, ptr %pl, i64 1
+ %gep_l2 = getelementptr inbounds i8, ptr %pl, i64 2
+ %gep_l3 = getelementptr inbounds i8, ptr %pl, i64 3
+ %gep_l4 = getelementptr inbounds i8, ptr %pl, i64 100
+ %gep_l5 = getelementptr inbounds i8, ptr %pl, i64 101
+ %gep_l6 = getelementptr inbounds i8, ptr %pl, i64 102
+ %gep_l7 = getelementptr inbounds i8, ptr %pl, i64 103
+ %gep_l8 = getelementptr inbounds i8, ptr %pl, i64 200
+ %gep_l9 = getelementptr inbounds i8, ptr %pl, i64 201
+ %gep_l10 = getelementptr inbounds i8, ptr %pl, i64 202
+ %gep_l11 = getelementptr inbounds i8, ptr %pl, i64 203
+ %gep_l12 = getelementptr inbounds i8, ptr %pl, i64 300
+ %gep_l13 = getelementptr inbounds i8, ptr %pl, i64 301
+ %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 302
+ %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 303
+
+ %load0 = load i8, ptr %gep_l0 , align 1
+ %load1 = load i8, ptr %gep_l1 , align 1
+ %load2 = load i8, ptr %gep_l2 , align 1
+ %load3 = load i8, ptr %gep_l3 , align 1
+ %load4 = load i8, ptr %gep_l4 , align 1
+ %load5 = load i8, ptr %gep_l5 , align 1
+ %load6 = load i8, ptr %gep_l6 , align 1
+ %load7 = load i8, ptr %gep_l7 , align 1
+ %load8 = load i8, ptr %gep_l8 , align 1
+ %load9 = load i8, ptr %gep_l9 , align 1
+ %load10 = load i8, ptr %gep_l10, align 1
+ %load11 = load i8, ptr %gep_l11, align 1
+ %load12 = load i8, ptr %gep_l12, align 1
+ %load13 = load i8, ptr %gep_l13, align 1
+ %load14 = load i8, ptr %gep_l14, align 1
+ %load15 = load i8, ptr %gep_l15, align 1
+
+ %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
+ %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1
+ %gep_s2 = getelementptr inbounds i8, ptr %ps, i64 2
+ %gep_s3 = getelementptr inbounds i8, ptr %ps, i64 3
+ %gep_s4 = getelementptr inbounds i8, ptr %ps, i64 4
+ %gep_s5 = getelementptr inbounds i8, ptr %ps, i64 5
+ %gep_s6 = getelementptr inbounds i8, ptr %ps, i64 6
+ %gep_s7 = getelementptr inbounds i8, ptr %ps, i64 7
+ %gep_s8 = getelementptr inbounds i8, ptr %ps, i64 8
+ %gep_s9 = getelementptr inbounds i8, ptr %ps, i64 9
+ %gep_s10 = getelementptr inbounds i8, ptr %ps, i64 10
+ %gep_s11 = getelementptr inbounds i8, ptr %ps, i64 11
+ %gep_s12 = getelementptr inbounds i8, ptr %ps, i64 12
+ %gep_s13 = getelementptr inbounds i8, ptr %ps, i64 13
+ %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14
+ %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15
+
+ store i8 %load0, ptr %gep_s0, align 1
+ store i8 %load1, ptr %gep_s1, align 1
+ store i8 %load2, ptr %gep_s2, align 1
+ store i8 %load3, ptr %gep_s3, align 1
+ store i8 %load4, ptr %gep_s4, align 1
+ store i8 %load5, ptr %gep_s5, align 1
+ store i8 %load6, ptr %gep_s6, align 1
+ store i8 %load7, ptr %gep_s7, align 1
+ store i8 %load8, ptr %gep_s8, align 1
+ store i8 %load9, ptr %gep_s9, align 1
+ store i8 %load10, ptr %gep_s10, align 1
+ store i8 %load11, ptr %gep_s11, align 1
+ store i8 %load12, ptr %gep_s12, align 1
+ store i8 %load13, ptr %gep_s13, align 1
+ store i8 %load14, ptr %gep_s14, align 1
+ store i8 %load15, ptr %gep_s15, align 1
+
+ ret void
+}
+; TODO: We want to generate this code:
; define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 %offset0
; %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
diff --git a/llvm/test/Unit/CMakeLists.txt b/llvm/test/Unit/CMakeLists.txt
new file mode 100644
index 0000000..6b0abe1
--- /dev/null
+++ b/llvm/test/Unit/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_lit_testsuite(check-llvm-unit "Running lit suite for LLVM unit tests"
+ ${CMAKE_CURRENT_BINARY_DIR}
+ EXCLUDE_FROM_CHECK_ALL
+ DEPENDS UnitTests
+ )
diff --git a/llvm/test/tools/llvm-ir2vec/entities.ll b/llvm/test/tools/llvm-ir2vec/entities.ll
index 4b51adf..8dbce57 100644
--- a/llvm/test/tools/llvm-ir2vec/entities.ll
+++ b/llvm/test/tools/llvm-ir2vec/entities.ll
@@ -1,6 +1,6 @@
; RUN: llvm-ir2vec entities | FileCheck %s
-CHECK: 84
+CHECK: 110
CHECK-NEXT: Ret 0
CHECK-NEXT: Br 1
CHECK-NEXT: Switch 2
@@ -85,3 +85,29 @@ CHECK-NEXT: Function 80
CHECK-NEXT: Pointer 81
CHECK-NEXT: Constant 82
CHECK-NEXT: Variable 83
+CHECK-NEXT: FCMP_false 84
+CHECK-NEXT: FCMP_oeq 85
+CHECK-NEXT: FCMP_ogt 86
+CHECK-NEXT: FCMP_oge 87
+CHECK-NEXT: FCMP_olt 88
+CHECK-NEXT: FCMP_ole 89
+CHECK-NEXT: FCMP_one 90
+CHECK-NEXT: FCMP_ord 91
+CHECK-NEXT: FCMP_uno 92
+CHECK-NEXT: FCMP_ueq 93
+CHECK-NEXT: FCMP_ugt 94
+CHECK-NEXT: FCMP_uge 95
+CHECK-NEXT: FCMP_ult 96
+CHECK-NEXT: FCMP_ule 97
+CHECK-NEXT: FCMP_une 98
+CHECK-NEXT: FCMP_true 99
+CHECK-NEXT: ICMP_eq 100
+CHECK-NEXT: ICMP_ne 101
+CHECK-NEXT: ICMP_ugt 102
+CHECK-NEXT: ICMP_uge 103
+CHECK-NEXT: ICMP_ult 104
+CHECK-NEXT: ICMP_ule 105
+CHECK-NEXT: ICMP_sgt 106
+CHECK-NEXT: ICMP_sge 107
+CHECK-NEXT: ICMP_slt 108
+CHECK-NEXT: ICMP_sle 109
diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
index 047557e..ea89c4d 100644
--- a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
+++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
@@ -83,7 +83,9 @@ static CGDataAction Action;
static std::optional<CGDataFormat> OutputFormat;
static std::vector<std::string> InputFilenames;
+namespace llvm {
extern cl::opt<bool> IndexedCodeGenDataLazyLoading;
+} // end namespace llvm
static void exitWithError(Twine Message, StringRef Whence = "",
StringRef Hint = "") {
diff --git a/llvm/tools/llvm-config/llvm-config.cpp b/llvm/tools/llvm-config/llvm-config.cpp
index 49df8fd..7f8c55a 100644
--- a/llvm/tools/llvm-config/llvm-config.cpp
+++ b/llvm/tools/llvm-config/llvm-config.cpp
@@ -357,18 +357,18 @@ int main(int argc, char **argv) {
ActivePrefix = CurrentExecPrefix;
{
SmallString<256> Path(LLVM_INSTALL_INCLUDEDIR);
- sys::fs::make_absolute(ActivePrefix, Path);
+ sys::path::make_absolute(ActivePrefix, Path);
ActiveIncludeDir = std::string(Path);
}
{
SmallString<256> Path(LLVM_TOOLS_INSTALL_DIR);
- sys::fs::make_absolute(ActivePrefix, Path);
+ sys::path::make_absolute(ActivePrefix, Path);
ActiveBinDir = std::string(Path);
}
ActiveLibDir = ActivePrefix + "/lib" + LLVM_LIBDIR_SUFFIX;
{
SmallString<256> Path(LLVM_INSTALL_PACKAGE_DIR);
- sys::fs::make_absolute(ActivePrefix, Path);
+ sys::path::make_absolute(ActivePrefix, Path);
ActiveCMakeDir = std::string(Path);
}
ActiveIncludeOption = "-I" + ActiveIncludeDir;
diff --git a/llvm/tools/llvm-dwp/llvm-dwp.cpp b/llvm/tools/llvm-dwp/llvm-dwp.cpp
index 61ba82d..31bad2d 100644
--- a/llvm/tools/llvm-dwp/llvm-dwp.cpp
+++ b/llvm/tools/llvm-dwp/llvm-dwp.cpp
@@ -94,7 +94,7 @@ getDWOFilenames(StringRef ExecFilename) {
dwarf::toString(Die.find(dwarf::DW_AT_comp_dir), "");
if (!DWOCompDir.empty()) {
SmallString<16> DWOPath(DWOName);
- sys::fs::make_absolute(DWOCompDir, DWOPath);
+ sys::path::make_absolute(DWOCompDir, DWOPath);
if (!sys::fs::exists(DWOPath) && sys::fs::exists(DWOName))
DWOPaths.push_back(std::move(DWOName));
else
diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index aabebf0..434449c 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -162,8 +162,8 @@ public:
for (const BasicBlock &BB : F) {
for (const auto &I : BB.instructionsWithoutDebug()) {
- unsigned Opcode = Vocabulary::getSlotIndex(I.getOpcode());
- unsigned TypeID = Vocabulary::getSlotIndex(I.getType()->getTypeID());
+ unsigned Opcode = Vocabulary::getIndex(I.getOpcode());
+ unsigned TypeID = Vocabulary::getIndex(I.getType()->getTypeID());
// Add "Next" relationship with previous instruction
if (HasPrevOpcode) {
@@ -184,7 +184,7 @@ public:
// Add "Arg" relationships
unsigned ArgIndex = 0;
for (const Use &U : I.operands()) {
- unsigned OperandID = Vocabulary::getSlotIndex(*U);
+ unsigned OperandID = Vocabulary::getIndex(*U.get());
unsigned RelationID = ArgRelation + ArgIndex;
OS << Opcode << '\t' << OperandID << '\t' << RelationID << '\n';
diff --git a/llvm/tools/llvm-opt-report/OptReport.cpp b/llvm/tools/llvm-opt-report/OptReport.cpp
index 68ed92c..e4b4fc2 100644
--- a/llvm/tools/llvm-opt-report/OptReport.cpp
+++ b/llvm/tools/llvm-opt-report/OptReport.cpp
@@ -274,7 +274,7 @@ static bool writeReport(LocationInfoTy &LocationInfo) {
for (auto &FI : LocationInfo) {
SmallString<128> FileName(FI.first);
if (!InputRelDir.empty())
- sys::fs::make_absolute(InputRelDir, FileName);
+ sys::path::make_absolute(InputRelDir, FileName);
const auto &FileInfo = FI.second;
diff --git a/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp b/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp
index dc6059d..b6e8567 100644
--- a/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp
+++ b/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp
@@ -43,8 +43,11 @@ class FunctionPropertiesAnalysisTest : public testing::Test {
public:
FunctionPropertiesAnalysisTest() {
auto VocabVector = ir2vec::Vocabulary::createDummyVocabForTest(1);
- MAM.registerPass([&] { return IR2VecVocabAnalysis(VocabVector); });
- IR2VecVocab = ir2vec::Vocabulary(std::move(VocabVector));
+ MAM.registerPass([VocabVector = std::move(VocabVector)]() mutable {
+ return IR2VecVocabAnalysis(std::move(VocabVector));
+ });
+ IR2VecVocab =
+ new ir2vec::Vocabulary(ir2vec::Vocabulary::createDummyVocabForTest(1));
MAM.registerPass([&] { return PassInstrumentationAnalysis(); });
FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); });
FAM.registerPass([&] { return DominatorTreeAnalysis(); });
@@ -66,7 +69,7 @@ protected:
std::unique_ptr<LoopInfo> LI;
FunctionAnalysisManager FAM;
ModuleAnalysisManager MAM;
- ir2vec::Vocabulary IR2VecVocab;
+ ir2vec::Vocabulary *IR2VecVocab;
void TearDown() override {
// Restore original IR2Vec weights
@@ -78,7 +81,7 @@ protected:
FunctionPropertiesInfo buildFPI(Function &F) {
// FunctionPropertiesInfo assumes IR2VecVocabAnalysis has been run to
// use IR2Vec.
- auto VocabResult = MAM.getResult<IR2VecVocabAnalysis>(*F.getParent());
+ auto &VocabResult = MAM.getResult<IR2VecVocabAnalysis>(*F.getParent());
(void)VocabResult;
return FunctionPropertiesInfo::getFunctionPropertiesInfo(F, FAM);
}
@@ -106,7 +109,7 @@ protected:
}
std::unique_ptr<ir2vec::Embedder> createEmbedder(const Function &F) {
- auto Emb = ir2vec::Embedder::create(IR2VecKind::Symbolic, F, IR2VecVocab);
+ auto Emb = ir2vec::Embedder::create(IR2VecKind::Symbolic, F, *IR2VecVocab);
EXPECT_TRUE(static_cast<bool>(Emb));
return Emb;
}
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp
index 9f2f6a3..743628f 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -295,7 +295,7 @@ TEST(IR2VecTest, ZeroDimensionEmbedding) {
// Fixture for IR2Vec tests requiring IR setup.
class IR2VecTestFixture : public ::testing::Test {
protected:
- Vocabulary V;
+ Vocabulary *V;
LLVMContext Ctx;
std::unique_ptr<Module> M;
Function *F = nullptr;
@@ -304,7 +304,7 @@ protected:
Instruction *RetInst = nullptr;
void SetUp() override {
- V = Vocabulary(Vocabulary::createDummyVocabForTest(2));
+ V = new Vocabulary(Vocabulary::createDummyVocabForTest(2));
// Setup IR
M = std::make_unique<Module>("TestM", Ctx);
@@ -322,7 +322,7 @@ protected:
};
TEST_F(IR2VecTestFixture, GetInstVecMap_Symbolic) {
- auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, V);
+ auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, *V);
ASSERT_TRUE(static_cast<bool>(Emb));
const auto &InstMap = Emb->getInstVecMap();
@@ -341,7 +341,7 @@ TEST_F(IR2VecTestFixture, GetInstVecMap_Symbolic) {
}
TEST_F(IR2VecTestFixture, GetInstVecMap_FlowAware) {
- auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, V);
+ auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, *V);
ASSERT_TRUE(static_cast<bool>(Emb));
const auto &InstMap = Emb->getInstVecMap();
@@ -358,7 +358,7 @@ TEST_F(IR2VecTestFixture, GetInstVecMap_FlowAware) {
}
TEST_F(IR2VecTestFixture, GetBBVecMap_Symbolic) {
- auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, V);
+ auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, *V);
ASSERT_TRUE(static_cast<bool>(Emb));
const auto &BBMap = Emb->getBBVecMap();
@@ -373,7 +373,7 @@ TEST_F(IR2VecTestFixture, GetBBVecMap_Symbolic) {
}
TEST_F(IR2VecTestFixture, GetBBVecMap_FlowAware) {
- auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, V);
+ auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, *V);
ASSERT_TRUE(static_cast<bool>(Emb));
const auto &BBMap = Emb->getBBVecMap();
@@ -388,7 +388,7 @@ TEST_F(IR2VecTestFixture, GetBBVecMap_FlowAware) {
}
TEST_F(IR2VecTestFixture, GetBBVector_Symbolic) {
- auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, V);
+ auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, *V);
ASSERT_TRUE(static_cast<bool>(Emb));
const auto &BBVec = Emb->getBBVector(*BB);
@@ -398,7 +398,7 @@ TEST_F(IR2VecTestFixture, GetBBVector_Symbolic) {
}
TEST_F(IR2VecTestFixture, GetBBVector_FlowAware) {
- auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, V);
+ auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, *V);
ASSERT_TRUE(static_cast<bool>(Emb));
const auto &BBVec = Emb->getBBVector(*BB);
@@ -408,7 +408,7 @@ TEST_F(IR2VecTestFixture, GetBBVector_FlowAware) {
}
TEST_F(IR2VecTestFixture, GetFunctionVector_Symbolic) {
- auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, V);
+ auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, *V);
ASSERT_TRUE(static_cast<bool>(Emb));
const auto &FuncVec = Emb->getFunctionVector();
@@ -420,7 +420,7 @@ TEST_F(IR2VecTestFixture, GetFunctionVector_Symbolic) {
}
TEST_F(IR2VecTestFixture, GetFunctionVector_FlowAware) {
- auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, V);
+ auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, *V);
ASSERT_TRUE(static_cast<bool>(Emb));
const auto &FuncVec = Emb->getFunctionVector();
@@ -435,6 +435,7 @@ static constexpr unsigned MaxOpcodes = Vocabulary::MaxOpcodes;
static constexpr unsigned MaxTypeIDs = Vocabulary::MaxTypeIDs;
static constexpr unsigned MaxCanonicalTypeIDs = Vocabulary::MaxCanonicalTypeIDs;
static constexpr unsigned MaxOperands = Vocabulary::MaxOperandKinds;
+static constexpr unsigned MaxPredicateKinds = Vocabulary::MaxPredicateKinds;
// Mapping between LLVM Type::TypeID tokens and Vocabulary::CanonicalTypeID
// names and their canonical string keys.
@@ -460,9 +461,13 @@ TEST(IR2VecVocabularyTest, DummyVocabTest) {
EXPECT_EQ(Emb.size(), Dim);
// Should have the correct total number of embeddings
- EXPECT_EQ(VocabVecSize, MaxOpcodes + MaxCanonicalTypeIDs + MaxOperands);
+ EXPECT_EQ(VocabVecSize, MaxOpcodes + MaxCanonicalTypeIDs + MaxOperands +
+ MaxPredicateKinds);
- auto ExpectedVocab = VocabVec;
+ // Collect embeddings for later comparison before moving VocabVec
+ std::vector<Embedding> ExpectedVocab;
+ for (const auto &Emb : VocabVec)
+ ExpectedVocab.push_back(Emb);
IR2VecVocabAnalysis VocabAnalysis(std::move(VocabVec));
LLVMContext TestCtx;
@@ -480,17 +485,17 @@ TEST(IR2VecVocabularyTest, DummyVocabTest) {
}
TEST(IR2VecVocabularyTest, SlotIdxMapping) {
- // Test getSlotIndex for Opcodes
+ // Test getIndex for Opcodes
#define EXPECT_OPCODE_SLOT(NUM, OPCODE, CLASS) \
- EXPECT_EQ(Vocabulary::getSlotIndex(NUM), static_cast<unsigned>(NUM - 1));
+ EXPECT_EQ(Vocabulary::getIndex(NUM), static_cast<unsigned>(NUM - 1));
#define HANDLE_INST(NUM, OPCODE, CLASS) EXPECT_OPCODE_SLOT(NUM, OPCODE, CLASS)
#include "llvm/IR/Instruction.def"
#undef HANDLE_INST
#undef EXPECT_OPCODE_SLOT
- // Test getSlotIndex for Types
+ // Test getIndex for Types
#define EXPECT_TYPE_SLOT(TypeIDTok, CanonEnum, CanonStr) \
- EXPECT_EQ(Vocabulary::getSlotIndex(Type::TypeIDTok), \
+ EXPECT_EQ(Vocabulary::getIndex(Type::TypeIDTok), \
MaxOpcodes + static_cast<unsigned>( \
Vocabulary::CanonicalTypeID::CanonEnum));
@@ -498,7 +503,7 @@ TEST(IR2VecVocabularyTest, SlotIdxMapping) {
#undef EXPECT_TYPE_SLOT
- // Test getSlotIndex for Value operands
+ // Test getIndex for Value operands
LLVMContext Ctx;
Module M("TestM", Ctx);
FunctionType *FTy =
@@ -508,40 +513,59 @@ TEST(IR2VecVocabularyTest, SlotIdxMapping) {
#define EXPECTED_VOCAB_OPERAND_SLOT(X) \
MaxOpcodes + MaxCanonicalTypeIDs + static_cast<unsigned>(X)
// Test Function operand
- EXPECT_EQ(Vocabulary::getSlotIndex(*F),
+ EXPECT_EQ(Vocabulary::getIndex(*F),
EXPECTED_VOCAB_OPERAND_SLOT(Vocabulary::OperandKind::FunctionID));
// Test Constant operand
Constant *C = ConstantInt::get(Type::getInt32Ty(Ctx), 42);
- EXPECT_EQ(Vocabulary::getSlotIndex(*C),
+ EXPECT_EQ(Vocabulary::getIndex(*C),
EXPECTED_VOCAB_OPERAND_SLOT(Vocabulary::OperandKind::ConstantID));
// Test Pointer operand
BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
AllocaInst *PtrVal = new AllocaInst(Type::getInt32Ty(Ctx), 0, "ptr", BB);
- EXPECT_EQ(Vocabulary::getSlotIndex(*PtrVal),
+ EXPECT_EQ(Vocabulary::getIndex(*PtrVal),
EXPECTED_VOCAB_OPERAND_SLOT(Vocabulary::OperandKind::PointerID));
// Test Variable operand (function argument)
Argument *Arg = F->getArg(0);
- EXPECT_EQ(Vocabulary::getSlotIndex(*Arg),
+ EXPECT_EQ(Vocabulary::getIndex(*Arg),
EXPECTED_VOCAB_OPERAND_SLOT(Vocabulary::OperandKind::VariableID));
#undef EXPECTED_VOCAB_OPERAND_SLOT
+
+ // Test getIndex for predicates
+#define EXPECTED_VOCAB_PREDICATE_SLOT(X) \
+ MaxOpcodes + MaxCanonicalTypeIDs + MaxOperands + static_cast<unsigned>(X)
+ for (unsigned P = CmpInst::FIRST_FCMP_PREDICATE;
+ P <= CmpInst::LAST_FCMP_PREDICATE; ++P) {
+ CmpInst::Predicate Pred = static_cast<CmpInst::Predicate>(P);
+ unsigned ExpectedIdx =
+ EXPECTED_VOCAB_PREDICATE_SLOT((P - CmpInst::FIRST_FCMP_PREDICATE));
+ EXPECT_EQ(Vocabulary::getIndex(Pred), ExpectedIdx);
+ }
+ auto ICMP_Start = CmpInst::LAST_FCMP_PREDICATE + 1;
+ for (unsigned P = CmpInst::FIRST_ICMP_PREDICATE;
+ P <= CmpInst::LAST_ICMP_PREDICATE; ++P) {
+ CmpInst::Predicate Pred = static_cast<CmpInst::Predicate>(P);
+ unsigned ExpectedIdx = EXPECTED_VOCAB_PREDICATE_SLOT(
+ ICMP_Start + P - CmpInst::FIRST_ICMP_PREDICATE);
+ EXPECT_EQ(Vocabulary::getIndex(Pred), ExpectedIdx);
+ }
+#undef EXPECTED_VOCAB_PREDICATE_SLOT
}
#if GTEST_HAS_DEATH_TEST
#ifndef NDEBUG
TEST(IR2VecVocabularyTest, NumericIDMapInvalidInputs) {
// Test invalid opcode IDs
- EXPECT_DEATH(Vocabulary::getSlotIndex(0u), "Invalid opcode");
- EXPECT_DEATH(Vocabulary::getSlotIndex(MaxOpcodes + 1), "Invalid opcode");
+ EXPECT_DEATH(Vocabulary::getIndex(0u), "Invalid opcode");
+ EXPECT_DEATH(Vocabulary::getIndex(MaxOpcodes + 1), "Invalid opcode");
// Test invalid type IDs
- EXPECT_DEATH(Vocabulary::getSlotIndex(static_cast<Type::TypeID>(MaxTypeIDs)),
+ EXPECT_DEATH(Vocabulary::getIndex(static_cast<Type::TypeID>(MaxTypeIDs)),
+ "Invalid type ID");
+ EXPECT_DEATH(Vocabulary::getIndex(static_cast<Type::TypeID>(MaxTypeIDs + 10)),
"Invalid type ID");
- EXPECT_DEATH(
- Vocabulary::getSlotIndex(static_cast<Type::TypeID>(MaxTypeIDs + 10)),
- "Invalid type ID");
}
#endif // NDEBUG
#endif // GTEST_HAS_DEATH_TEST
@@ -551,7 +575,7 @@ TEST(IR2VecVocabularyTest, StringKeyGeneration) {
EXPECT_EQ(Vocabulary::getStringKey(12), "Add");
#define EXPECT_OPCODE(NUM, OPCODE, CLASS) \
- EXPECT_EQ(Vocabulary::getStringKey(Vocabulary::getSlotIndex(NUM)), \
+ EXPECT_EQ(Vocabulary::getStringKey(Vocabulary::getIndex(NUM)), \
Vocabulary::getVocabKeyForOpcode(NUM));
#define HANDLE_INST(NUM, OPCODE, CLASS) EXPECT_OPCODE(NUM, OPCODE, CLASS)
#include "llvm/IR/Instruction.def"
@@ -569,6 +593,7 @@ TEST(IR2VecVocabularyTest, StringKeyGeneration) {
#undef EXPECT_CANONICAL_TYPE_NAME
+ // Verify OperandKind -> string mapping
#define HANDLE_OPERAND_KINDS(X) \
X(FunctionID, "Function") \
X(PointerID, "Pointer") \
@@ -592,6 +617,28 @@ TEST(IR2VecVocabularyTest, StringKeyGeneration) {
Vocabulary::getStringKey(MaxOpcodes + MaxCanonicalTypeIDs + 1);
EXPECT_EQ(FuncArgKey, "Function");
EXPECT_EQ(PtrArgKey, "Pointer");
+
+// Verify PredicateKind -> string mapping
+#define EXPECT_PREDICATE_KIND(PredNum, PredPos, PredKind) \
+ do { \
+ std::string PredStr = \
+ std::string(PredKind) + "_" + \
+ CmpInst::getPredicateName(static_cast<CmpInst::Predicate>(PredNum)) \
+ .str(); \
+ unsigned Pos = MaxOpcodes + MaxCanonicalTypeIDs + MaxOperands + PredPos; \
+ EXPECT_EQ(Vocabulary::getStringKey(Pos), PredStr); \
+ } while (0)
+
+ for (unsigned P = CmpInst::FIRST_FCMP_PREDICATE;
+ P <= CmpInst::LAST_FCMP_PREDICATE; ++P)
+ EXPECT_PREDICATE_KIND(P, P - CmpInst::FIRST_FCMP_PREDICATE, "FCMP");
+
+ auto ICMP_Pos = CmpInst::LAST_FCMP_PREDICATE + 1;
+ for (unsigned P = CmpInst::FIRST_ICMP_PREDICATE;
+ P <= CmpInst::LAST_ICMP_PREDICATE; ++P)
+ EXPECT_PREDICATE_KIND(P, ICMP_Pos++, "ICMP");
+
+#undef EXPECT_PREDICATE_KIND
}
TEST(IR2VecVocabularyTest, VocabularyDimensions) {
@@ -627,10 +674,12 @@ TEST(IR2VecVocabularyTest, InvalidAccess) {
#endif // GTEST_HAS_DEATH_TEST
TEST(IR2VecVocabularyTest, TypeIDStringKeyMapping) {
+ Vocabulary V = Vocabulary(Vocabulary::createDummyVocabForTest());
#define EXPECT_TYPE_TO_CANONICAL(TypeIDTok, CanonEnum, CanonStr) \
- EXPECT_EQ( \
- Vocabulary::getStringKey(Vocabulary::getSlotIndex(Type::TypeIDTok)), \
- CanonStr);
+ do { \
+ unsigned FlatIdx = V.getIndex(Type::TypeIDTok); \
+ EXPECT_EQ(Vocabulary::getStringKey(FlatIdx), CanonStr); \
+ } while (0);
IR2VEC_HANDLE_TYPE_BIMAP(EXPECT_TYPE_TO_CANONICAL)
@@ -638,14 +687,20 @@ TEST(IR2VecVocabularyTest, TypeIDStringKeyMapping) {
}
TEST(IR2VecVocabularyTest, InvalidVocabularyConstruction) {
- std::vector<Embedding> InvalidVocab;
- InvalidVocab.push_back(Embedding(2, 1.0));
- InvalidVocab.push_back(Embedding(2, 2.0));
-
- Vocabulary V(std::move(InvalidVocab));
+ // Test 1: Create invalid VocabStorage with insufficient sections
+ std::vector<std::vector<Embedding>> InvalidSectionData;
+ // Only add one section with 2 embeddings, but the vocabulary needs 4 sections
+ std::vector<Embedding> Section1;
+ Section1.push_back(Embedding(2, 1.0));
+ Section1.push_back(Embedding(2, 2.0));
+ InvalidSectionData.push_back(std::move(Section1));
+
+ VocabStorage InvalidStorage(std::move(InvalidSectionData));
+ Vocabulary V(std::move(InvalidStorage));
EXPECT_FALSE(V.isValid());
{
+ // Test 2: Default-constructed vocabulary should be invalid
Vocabulary InvalidResult;
EXPECT_FALSE(InvalidResult.isValid());
#if GTEST_HAS_DEATH_TEST
@@ -656,4 +711,265 @@ TEST(IR2VecVocabularyTest, InvalidVocabularyConstruction) {
}
}
+TEST(VocabStorageTest, DefaultConstructor) {
+ VocabStorage storage;
+
+ EXPECT_EQ(storage.size(), 0u);
+ EXPECT_EQ(storage.getNumSections(), 0u);
+ EXPECT_EQ(storage.getDimension(), 0u);
+ EXPECT_FALSE(storage.isValid());
+
+ // Test iterators on empty storage
+ EXPECT_EQ(storage.begin(), storage.end());
+}
+
+TEST(VocabStorageTest, BasicConstruction) {
+ // Create test data with 3 sections
+ std::vector<std::vector<Embedding>> sectionData;
+
+ // Section 0: 2 embeddings of dimension 3
+ std::vector<Embedding> section0;
+ section0.emplace_back(std::vector<double>{1.0, 2.0, 3.0});
+ section0.emplace_back(std::vector<double>{4.0, 5.0, 6.0});
+ sectionData.push_back(std::move(section0));
+
+ // Section 1: 1 embedding of dimension 3
+ std::vector<Embedding> section1;
+ section1.emplace_back(std::vector<double>{7.0, 8.0, 9.0});
+ sectionData.push_back(std::move(section1));
+
+ // Section 2: 3 embeddings of dimension 3
+ std::vector<Embedding> section2;
+ section2.emplace_back(std::vector<double>{10.0, 11.0, 12.0});
+ section2.emplace_back(std::vector<double>{13.0, 14.0, 15.0});
+ section2.emplace_back(std::vector<double>{16.0, 17.0, 18.0});
+ sectionData.push_back(std::move(section2));
+
+ VocabStorage storage(std::move(sectionData));
+
+ EXPECT_EQ(storage.size(), 6u); // Total: 2 + 1 + 3 = 6
+ EXPECT_EQ(storage.getNumSections(), 3u);
+ EXPECT_EQ(storage.getDimension(), 3u);
+ EXPECT_TRUE(storage.isValid());
+}
+
+TEST(VocabStorageTest, SectionAccess) {
+ // Create test data
+ std::vector<std::vector<Embedding>> sectionData;
+
+ std::vector<Embedding> section0;
+ section0.emplace_back(std::vector<double>{1.0, 2.0});
+ section0.emplace_back(std::vector<double>{3.0, 4.0});
+ sectionData.push_back(std::move(section0));
+
+ std::vector<Embedding> section1;
+ section1.emplace_back(std::vector<double>{5.0, 6.0});
+ sectionData.push_back(std::move(section1));
+
+ VocabStorage storage(std::move(sectionData));
+
+ // Test section access
+ EXPECT_EQ(storage[0].size(), 2u);
+ EXPECT_EQ(storage[1].size(), 1u);
+
+ // Test embedding values
+ EXPECT_THAT(storage[0][0].getData(), ElementsAre(1.0, 2.0));
+ EXPECT_THAT(storage[0][1].getData(), ElementsAre(3.0, 4.0));
+ EXPECT_THAT(storage[1][0].getData(), ElementsAre(5.0, 6.0));
+}
+
+#if GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+TEST(VocabStorageTest, InvalidSectionAccess) {
+ std::vector<std::vector<Embedding>> sectionData;
+ std::vector<Embedding> section0;
+ section0.emplace_back(std::vector<double>{1.0, 2.0});
+ sectionData.push_back(std::move(section0));
+
+ VocabStorage storage(std::move(sectionData));
+
+ EXPECT_DEATH(storage[1], "Invalid section ID");
+ EXPECT_DEATH(storage[10], "Invalid section ID");
+}
+
+TEST(VocabStorageTest, EmptySection) {
+ std::vector<std::vector<Embedding>> sectionData;
+ std::vector<Embedding> emptySection; // Empty section
+ sectionData.push_back(std::move(emptySection));
+
+ std::vector<Embedding> validSection;
+ validSection.emplace_back(std::vector<double>{1.0});
+ sectionData.push_back(std::move(validSection));
+
+ EXPECT_DEATH(VocabStorage(std::move(sectionData)),
+ "Vocabulary section is empty");
+}
+
+TEST(VocabStorageTest, EmptyMiddleSection) {
+ std::vector<std::vector<Embedding>> sectionData;
+
+ // Valid first section
+ std::vector<Embedding> validSection1;
+ validSection1.emplace_back(std::vector<double>{1.0});
+ sectionData.push_back(std::move(validSection1));
+
+ // Empty middle section
+ std::vector<Embedding> emptySection;
+ sectionData.push_back(std::move(emptySection));
+
+ // Valid last section
+ std::vector<Embedding> validSection2;
+ validSection2.emplace_back(std::vector<double>{2.0});
+ sectionData.push_back(std::move(validSection2));
+
+ EXPECT_DEATH(VocabStorage(std::move(sectionData)),
+ "Vocabulary section is empty");
+}
+
+TEST(VocabStorageTest, NoSections) {
+ std::vector<std::vector<Embedding>> sectionData; // No sections
+
+ EXPECT_DEATH(VocabStorage(std::move(sectionData)),
+ "Vocabulary has no sections");
+}
+
+TEST(VocabStorageTest, MismatchedDimensionsAcrossSections) {
+ std::vector<std::vector<Embedding>> sectionData;
+
+ // Section 0: embeddings with dimension 2
+ std::vector<Embedding> section0;
+ section0.emplace_back(std::vector<double>{1.0, 2.0});
+ section0.emplace_back(std::vector<double>{3.0, 4.0});
+ sectionData.push_back(std::move(section0));
+
+ // Section 1: embedding with dimension 3 (mismatch!)
+ std::vector<Embedding> section1;
+ section1.emplace_back(std::vector<double>{5.0, 6.0, 7.0});
+ sectionData.push_back(std::move(section1));
+
+ EXPECT_DEATH(VocabStorage(std::move(sectionData)),
+ "All embeddings must have the same dimension");
+}
+
+TEST(VocabStorageTest, MismatchedDimensionsWithinSection) {
+ std::vector<std::vector<Embedding>> sectionData;
+
+ // Section 0: first embedding with dimension 2, second with dimension 3
+ std::vector<Embedding> section0;
+ section0.emplace_back(std::vector<double>{1.0, 2.0});
+ section0.emplace_back(std::vector<double>{3.0, 4.0, 5.0}); // Mismatch!
+ sectionData.push_back(std::move(section0));
+
+ EXPECT_DEATH(VocabStorage(std::move(sectionData)),
+ "All embeddings must have the same dimension");
+}
+#endif // NDEBUG
+#endif // GTEST_HAS_DEATH_TEST
+
+TEST(VocabStorageTest, IteratorBasics) {
+ std::vector<std::vector<Embedding>> sectionData;
+
+ std::vector<Embedding> section0;
+ section0.emplace_back(std::vector<double>{1.0, 2.0});
+ section0.emplace_back(std::vector<double>{3.0, 4.0});
+ sectionData.push_back(std::move(section0));
+
+ std::vector<Embedding> section1;
+ section1.emplace_back(std::vector<double>{5.0, 6.0});
+ sectionData.push_back(std::move(section1));
+
+ VocabStorage storage(std::move(sectionData));
+
+ // Test iterator basics
+ auto it = storage.begin();
+ auto end = storage.end();
+
+ EXPECT_NE(it, end);
+
+ // Check first embedding
+ EXPECT_THAT((*it).getData(), ElementsAre(1.0, 2.0));
+
+ // Advance to second embedding
+ ++it;
+ EXPECT_NE(it, end);
+ EXPECT_THAT((*it).getData(), ElementsAre(3.0, 4.0));
+
+ // Advance to third embedding (in section 1)
+ ++it;
+ EXPECT_NE(it, end);
+ EXPECT_THAT((*it).getData(), ElementsAre(5.0, 6.0));
+
+ // Advance past the end
+ ++it;
+ EXPECT_EQ(it, end);
+}
+
+TEST(VocabStorageTest, IteratorTraversal) {
+ std::vector<std::vector<Embedding>> sectionData;
+
+ // Section 0: 2 embeddings
+ std::vector<Embedding> section0;
+ section0.emplace_back(std::vector<double>{10.0});
+ section0.emplace_back(std::vector<double>{20.0});
+ sectionData.push_back(std::move(section0));
+
+ // Section 1: 1 embedding
+ std::vector<Embedding> section1;
+ section1.emplace_back(std::vector<double>{25.0});
+ sectionData.push_back(std::move(section1));
+
+ // Section 2: 3 embeddings
+ std::vector<Embedding> section2;
+ section2.emplace_back(std::vector<double>{30.0});
+ section2.emplace_back(std::vector<double>{40.0});
+ section2.emplace_back(std::vector<double>{50.0});
+ sectionData.push_back(std::move(section2));
+
+ VocabStorage storage(std::move(sectionData));
+
+ // Collect all values using iterator
+ std::vector<double> values;
+ for (const auto &emb : storage) {
+ EXPECT_EQ(emb.size(), 1u);
+ values.push_back(emb[0]);
+ }
+
+ // Should get all embeddings from all sections
+ EXPECT_THAT(values, ElementsAre(10.0, 20.0, 25.0, 30.0, 40.0, 50.0));
+}
+
+TEST(VocabStorageTest, IteratorComparison) {
+ std::vector<std::vector<Embedding>> sectionData;
+ std::vector<Embedding> section0;
+ section0.emplace_back(std::vector<double>{1.0});
+ section0.emplace_back(std::vector<double>{2.0});
+ sectionData.push_back(std::move(section0));
+
+ VocabStorage storage(std::move(sectionData));
+
+ auto it1 = storage.begin();
+ auto it2 = storage.begin();
+ auto end = storage.end();
+
+ // Test equality
+ EXPECT_EQ(it1, it2);
+ EXPECT_NE(it1, end);
+
+ // Advance one iterator
+ ++it1;
+ EXPECT_NE(it1, it2);
+ EXPECT_NE(it1, end);
+
+ // Advance second iterator to match
+ ++it2;
+ EXPECT_EQ(it1, it2);
+
+ // Advance both to end
+ ++it1;
+ ++it2;
+ EXPECT_EQ(it1, end);
+ EXPECT_EQ(it2, end);
+ EXPECT_EQ(it1, it2);
+}
+
} // end anonymous namespace
diff --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
index 8c4fd8b..d1c0f64 100644
--- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
+++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
@@ -24,7 +24,9 @@
using namespace llvm;
using namespace llvm::memprof;
+namespace llvm {
LLVM_ABI extern cl::opt<bool> MemProfKeepAllNotColdContexts;
+} // end namespace llvm
namespace {
diff --git a/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp b/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp
index 45dc50e..c8752c7 100644
--- a/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp
+++ b/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp
@@ -25,9 +25,10 @@
#include "llvm/Support/raw_ostream.h"
#include "gtest/gtest.h"
-LLVM_ABI extern llvm::cl::opt<bool> ScalePartialSampleProfileWorkingSetSize;
-
namespace llvm {
+
+LLVM_ABI extern cl::opt<bool> ScalePartialSampleProfileWorkingSetSize;
+
namespace {
class ProfileSummaryInfoTest : public testing::Test {
diff --git a/llvm/unittests/CodeGen/RegAllocScoreTest.cpp b/llvm/unittests/CodeGen/RegAllocScoreTest.cpp
index 86bfc7a..432dc93 100644
--- a/llvm/unittests/CodeGen/RegAllocScoreTest.cpp
+++ b/llvm/unittests/CodeGen/RegAllocScoreTest.cpp
@@ -31,11 +31,14 @@
#include "gtest/gtest.h"
using namespace llvm;
+
+namespace llvm {
LLVM_ABI extern cl::opt<double> CopyWeight;
LLVM_ABI extern cl::opt<double> LoadWeight;
LLVM_ABI extern cl::opt<double> StoreWeight;
LLVM_ABI extern cl::opt<double> CheapRematWeight;
LLVM_ABI extern cl::opt<double> ExpensiveRematWeight;
+} // namespace llvm
namespace {
// Include helper functions to ease the manipulation of MachineFunctions.
diff --git a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
index 1eb03f1..451c376 100644
--- a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
+++ b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
@@ -266,7 +266,8 @@ TEST(HLSLRootSignatureTest, DefaultStaticSamplerDump) {
"minLOD = 0.000000e+00, "
"maxLOD = 3.402823e+38, "
"space = 0, "
- "visibility = All"
+ "visibility = All, "
+ "flags = None"
")";
EXPECT_EQ(Out, Expected);
}
@@ -287,6 +288,7 @@ TEST(HLSLRootSignatureTest, DefinedStaticSamplerDump) {
Sampler.MaxLOD = 32.0f;
Sampler.Space = 7;
Sampler.Visibility = llvm::dxbc::ShaderVisibility::Domain;
+ Sampler.Flags = llvm::dxbc::StaticSamplerFlags::NonNormalizedCoordinates;
std::string Out;
llvm::raw_string_ostream OS(Out);
@@ -305,7 +307,8 @@ TEST(HLSLRootSignatureTest, DefinedStaticSamplerDump) {
"minLOD = 1.000000e+00, "
"maxLOD = 3.200000e+01, "
"space = 7, "
- "visibility = Domain"
+ "visibility = Domain, "
+ "flags = NonNormalizedCoordinates"
")";
EXPECT_EQ(Out, Expected);
}
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index abe36bc..6ea951e 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -26,13 +26,14 @@
#include <initializer_list>
-LLVM_ABI extern llvm::cl::opt<float> MemProfLifetimeAccessDensityColdThreshold;
-LLVM_ABI extern llvm::cl::opt<unsigned> MemProfAveLifetimeColdThreshold;
-LLVM_ABI extern llvm::cl::opt<unsigned>
+namespace llvm {
+
+LLVM_ABI extern cl::opt<float> MemProfLifetimeAccessDensityColdThreshold;
+LLVM_ABI extern cl::opt<unsigned> MemProfAveLifetimeColdThreshold;
+LLVM_ABI extern cl::opt<unsigned>
MemProfMinAveLifetimeAccessDensityHotThreshold;
-LLVM_ABI extern llvm::cl::opt<bool> MemProfUseHotHints;
+LLVM_ABI extern cl::opt<bool> MemProfUseHotHints;
-namespace llvm {
namespace memprof {
namespace {
diff --git a/llvm/unittests/Support/Path.cpp b/llvm/unittests/Support/Path.cpp
index 888729b..eb649de 100644
--- a/llvm/unittests/Support/Path.cpp
+++ b/llvm/unittests/Support/Path.cpp
@@ -255,14 +255,14 @@ TEST(Support, Path) {
{
SmallString<32> Relative("foo.cpp");
- sys::fs::make_absolute("/root", Relative);
+ path::make_absolute("/root", Relative);
Relative[5] = '/'; // Fix up windows paths.
ASSERT_EQ("/root/foo.cpp", Relative);
}
{
SmallString<32> Relative("foo.cpp");
- sys::fs::make_absolute("//root", Relative);
+ path::make_absolute("//root", Relative);
Relative[6] = '/'; // Fix up windows paths.
ASSERT_EQ("//root/foo.cpp", Relative);
}
diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp
index c84e760..8f199b6 100644
--- a/mlir/lib/IR/Builders.cpp
+++ b/mlir/lib/IR/Builders.cpp
@@ -489,13 +489,6 @@ OpBuilder::tryFold(Operation *op, SmallVectorImpl<Value> &results,
SmallVector<OpFoldResult, 4> foldResults;
LDBG() << "Trying to fold: "
<< OpWithFlags(op, OpPrintingFlags().skipRegions());
- if (op->getName().getStringRef() == "vector.extract") {
- Operation *parent = op->getParentOp();
- while (parent && parent->getName().getStringRef() != "spirv.func")
- parent = parent->getParentOp();
- if (parent)
- parent->dump();
- }
if (failed(op->fold(foldResults)))
return cleanupFailure();
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index e57d9de..026664b 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -5336,6 +5336,7 @@ libc_support_library(
":__support_common",
":__support_cpp_bitset",
":__support_cpp_type_traits",
+ ":__support_macros_attributes",
":__support_macros_optimization",
":hdr_limits_macros",
":llvm_libc_types_size_t",